aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristudasan Devadasan <Christudasan.Devadasan@amd.com>2024-06-20 10:21:32 +0000
committerChristudasan Devadasan <Christudasan.Devadasan@amd.com>2024-06-20 10:22:20 +0000
commit0dde26d55af778d7b43a3daac1bd36029db0a918 (patch)
tree1c49ae1eb41d52cfec5d20b299f7e2a4c2fd94e3
parent65eb44327cf32a83dbbf13eb70f9d8c03f3efaef (diff)
downloadllvm-0dde26d55af778d7b43a3daac1bd36029db0a918.zip
llvm-0dde26d55af778d7b43a3daac1bd36029db0a918.tar.gz
llvm-0dde26d55af778d7b43a3daac1bd36029db0a918.tar.bz2
[AMDGPU] Codegen support for constrained multi-dword sloads (branch: users/cdevadas/enable-codegen-for-constrained-sloads)
For targets that support the xnack replay feature (gfx8+), multi-dword scalar loads must not clobber any register that holds the src address. The constrained versions of the scalar loads have the early-clobber flag attached to the dst operand to prevent RA from re-allocating any of the src regs for the dst operand.
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td116
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll244
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll234
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir72
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll405
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll203
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll66
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll164
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll278
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll527
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll292
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.ll545
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.v2i16.ll268
-rw-r--r--llvm/test/CodeGen/AMDGPU/amd.endpgm.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1225
-rw-r--r--llvm/test/CodeGen/AMDGPU/and.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/anyext.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll524
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll2005
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll2155
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll456
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll526
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfe-combine.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfe-patterns.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfi_int.ll136
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfm.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/bitreverse.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/br_cc.f16.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bswap.ll156
-rw-r--r--llvm/test/CodeGen/AMDGPU/build_vector.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll324
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll850
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp-modifier.ll418
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp.ll1334
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll257
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy_to_scc.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop64.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll305
-rw-r--r--llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds-alignment.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/fabs.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/fadd.f16.ll422
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll722
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcmp.f16.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll415
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll356
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv.f16.ll398
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv.ll107
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll184
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics.ll336
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll2234
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll324
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma-combine.ll162
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll2790
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3.ll664
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul.f16.ll342
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll752
-rw-r--r--llvm/test/CodeGen/AMDGPU/fnearbyint.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll84
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.ll274
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-classify.ll282
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll262
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.ll126
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.ll104
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext.f16.ll586
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll266
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll266
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll972
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll812
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll196
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshl.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshr.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsub.f16.ll252
-rw-r--r--llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics.ll650
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll2644
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll2044
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll1188
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll1188
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll1612
-rw-r--r--llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/idiv-licm.ll507
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot2.ll367
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4s.ll607
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll865
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot8s.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot8u.ll259
-rw-r--r--llvm/test/CodeGen/AMDGPU/imm.ll642
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll442
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-args.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll380
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll1072
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll1860
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll37
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll27
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll736
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll1268
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll66
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll224
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll192
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll150
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll334
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll214
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll596
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll536
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log10.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll156
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll524
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll520
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.mulo.ll424
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.round.ll426
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-f64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i1.ll3494
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i16.ll1583
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i32.ll413
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i64.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll2533
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll582
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i32.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll78
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad.u16.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_64_32.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/madak.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll420
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.i16.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory_clause.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/min.ll132
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul.ll1510
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_int24.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-flat.ll890
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-global.ll722
-rw-r--r--llvm/test/CodeGen/AMDGPU/omod.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/or.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-op-sel.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/rcp-pattern.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotl.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotr.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/saddo.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv.ll278
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll370
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll630
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll386
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.v2i16.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll1364
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sign_extend.ll184
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll354
-rw-r--r--llvm/test/CodeGen/AMDGPU/sitofp.f16.ll182
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll5170
-rw-r--r--llvm/test/CodeGen/AMDGPU/sra.ll274
-rw-r--r--llvm/test/CodeGen/AMDGPU/srl.ll70
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.ll236
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll304
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap-abis.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-combine.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll178
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv.ll230
-rw-r--r--llvm/test/CodeGen/AMDGPU/udivrem.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll284
-rw-r--r--llvm/test/CodeGen/AMDGPU/uitofp.f16.ll182
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-cfg.ll150
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll178
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_cndmask.ll250
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_madak_f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_pack.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll282
-rw-r--r--llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/xor.ll174
-rw-r--r--llvm/test/CodeGen/AMDGPU/zero_extend.ll2
265 files changed, 43685 insertions, 43512 deletions
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4551a3a..9fbedce 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -867,13 +867,104 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true if the load is at most one dword (Size <= 4) or is a naturally aligned multi-dword load.
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size <= 4) || (Ld->getAlign().value() >= PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size <= 4) || (Ld.getMMO().getAlign().value() >= PowerOf2Ceil(Size));
+ }];
+}
+
+class SMRDUnalignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true if it is an under aligned multi-dword load.
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size > 4) && (Ld->getAlign().value() < PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size > 4) && (Ld.getMMO().getAlign().value() < PowerOf2Ceil(Size));
+ }];
+}
+
+def alignedmultidwordload : SMRDAlignedLoadPat<smrd_load>;
+def unalignedmultidwordload : SMRDUnalignedLoadPat<smrd_load>;
+
+multiclass SMRD_Align_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+
+ // 2. SGPR offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_ec") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 3. SGPR+IMM offset
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (alignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (vt (unalignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+}
+
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
+ }
// 2. 32-bit IMM offset on CI
if immci then def : GCNPat <
@@ -886,26 +977,17 @@ multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
- let OtherPredicates = [isNotGFX9Plus];
- }
- def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ let OtherPredicates = [isGFX6GFX7];
}
- // 4. SGPR+IMM offset
+ // 4. No offset
def : GCNPat <
- (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
}
- // 5. No offset
- def : GCNPat <
- (vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
- >;
+ defm : SMRD_Align_Pattern<Instr, vt>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index a38b6e3..9a8672d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -7,11 +7,11 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -23,10 +23,10 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -59,11 +59,11 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -75,10 +75,10 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index bb5ccc3..57a8bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -113,9 +113,9 @@ bb1:
define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
; WAVE64-LABEL: brcond_sgpr_trunc_and:
; WAVE64: ; %bb.0: ; %entry
-; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE64-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE64-NEXT: s_and_b32 s0, s0, s1
+; WAVE64-NEXT: s_and_b32 s0, s2, s3
; WAVE64-NEXT: s_xor_b32 s0, s0, 1
; WAVE64-NEXT: s_and_b32 s0, s0, 1
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
@@ -131,9 +131,9 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
;
; WAVE32-LABEL: brcond_sgpr_trunc_and:
; WAVE32: ; %bb.0: ; %entry
-; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE32-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-NEXT: s_and_b32 s0, s0, s1
+; WAVE32-NEXT: s_and_b32 s0, s2, s3
; WAVE32-NEXT: s_xor_b32 s0, s0, 1
; WAVE32-NEXT: s_and_b32 s0, s0, 1
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 3f034ea..9cabe0c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1400,11 +1400,11 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
@@ -1412,8 +1412,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_add_f32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index a018ea5..ce0d9c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -27,10 +27,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -43,10 +43,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 4e94a64..081e257 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1090,21 +1090,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1112,14 +1112,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1204,21 +1204,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1226,14 +1226,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1480,34 +1480,34 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1542,11 +1542,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1566,10 +1566,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1583,11 +1583,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1595,10 +1595,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1612,11 +1612,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1637,10 +1637,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1761,19 +1761,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1803,11 +1803,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1825,10 +1825,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1842,19 +1842,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1884,19 +1884,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 05cdb54..4635db9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -38,20 +38,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -88,16 +88,16 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -134,16 +134,16 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -189,14 +189,14 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s6
+; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -207,9 +207,9 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, -v1, v0, s2
+; VI-NEXT: v_fma_f32 v2, -v1, v0, s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -244,16 +244,16 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -288,16 +288,16 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -342,15 +342,15 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
+; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -359,9 +359,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -402,23 +402,23 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -459,23 +459,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -546,31 +546,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s3, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: s_lshr_b32 s3, s1, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s2
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
+; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s2
; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
+; VI-NEXT: v_fma_f16 v1, -v1, v2, s2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -683,47 +683,47 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s8, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s8, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s6, s2, 16
+; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_rcp_f32_e32 v4, v4
-; VI-NEXT: s_lshr_b32 s9, s1, 16
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: s_lshr_b32 s9, s3, 16
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s7, s3, 16
+; VI-NEXT: s_lshr_b32 s7, s1, 16
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT: v_trunc_f16_e32 v1, v1
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v4, s9
-; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
+; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s1
; VI-NEXT: v_trunc_f16_e32 v2, v2
-; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
+; VI-NEXT: v_fma_f16 v2, -v2, v3, s1
; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
; VI-NEXT: v_mul_f32_e32 v3, v3, v5
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
@@ -793,14 +793,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
+; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -811,12 +811,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
-; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
+; VI-NEXT: v_fma_f32 v0, -v1, v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s1
+; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
; VI-NEXT: v_rcp_f32_e32 v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
@@ -827,9 +827,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
+; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT: v_fma_f32 v1, -v2, v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 5d48168..83a85c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -16,9 +16,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
@@ -64,9 +64,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index c444772..7bbce45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -167,15 +167,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -210,15 +210,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -254,15 +254,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -342,15 +342,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_s64_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -386,15 +386,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -782,15 +782,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s16
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -826,15 +826,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -870,15 +870,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v16s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -914,15 +914,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index 7587aa0..2a725ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds
; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 36, 0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
; GCN: $m0 = S_MOV_B32 -1
; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 1a49a38..4671f60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -40,38 +40,38 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -123,38 +123,38 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -195,7 +195,6 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -208,8 +207,10 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -217,30 +218,32 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -281,7 +284,6 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -294,8 +296,10 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -303,30 +307,32 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -661,7 +667,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -672,33 +678,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -736,7 +744,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -747,33 +755,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -811,7 +821,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -822,33 +832,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -886,7 +898,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -897,33 +909,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -957,12 +971,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -981,13 +996,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1015,12 +1030,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1039,13 +1055,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1074,13 +1090,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1090,22 +1107,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1134,13 +1153,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1150,22 +1170,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1195,42 +1217,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1264,42 +1286,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], 2.0, 2.0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1336,11 +1358,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
@@ -1350,41 +1372,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1426,11 +1448,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -1440,41 +1462,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1509,29 +1531,30 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_val_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1557,29 +1580,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 0x41000000, 0x41000000, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_val_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1603,29 +1627,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1653,29 +1678,29 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_val_undef_val:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 87d0d71..a4aea63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -843,75 +843,47 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
}
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX10-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000
+; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000
+; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s17, 0x102
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
@@ -921,18 +893,18 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: v_mov_b32_e32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_mov_b32 s5, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_mov_b32_e32 v10, s17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -954,84 +926,59 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
}
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200
+; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s13, 0x102
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index e7faabb..66d1f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -353,8 +353,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s5, 0x405ec000
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +369,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a6, s10
; GCN-NEXT: v_accvgpr_write_b32 a7, s11
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index c0cd068..f712df2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -8,14 +8,14 @@
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -34,12 +34,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x02,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
@@ -50,38 +50,38 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
; GFX8-LABEL: mov_dpp64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00]
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; encoding: [0x06,0x00,0x10,0xca,0x07,0x00,0x00,0x00]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x04,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index fa24489..3d352db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,16 +4,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -23,17 +23,17 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -47,32 +47,32 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
-; GCN-NEXT: s_cselect_b32 s4, 1, 0
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_mov_b32 s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: .LBB2_2: ; %Flow
-; GCN-NEXT: s_xor_b32 s2, s2, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_xor_b32 s0, s0, 1
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: .LBB2_4: ; %.exit
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
@@ -96,17 +96,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -116,21 +116,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -140,17 +140,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -160,17 +160,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -228,17 +228,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -320,17 +320,17 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -340,16 +340,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -359,16 +359,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -378,16 +378,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -397,16 +397,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index d628270..7f720e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -46,47 +46,47 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-LABEL: update_dppi64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -101,47 +101,47 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-LABEL: update_dppf64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -266,47 +266,47 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -321,13 +321,13 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p3_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -336,11 +336,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_read_b32 v1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -349,11 +349,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: ds_load_b32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -371,17 +371,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
; GFX8-LABEL: update_dpp_p5_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s90, -1
; GFX8-NEXT: s_mov_b32 s91, 0xe80000
; GFX8-NEXT: s_add_u32 s88, s88, s3
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -390,17 +390,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31c16000
; GFX10-NEXT: s_add_u32 s4, s4, s3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -409,11 +409,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: scratch_load_b32 v1, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb1043..9251f26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1017,7 +1017,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1054,7 +1056,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_i96_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1091,7 +1095,9 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1128,7 +1134,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1166,7 +1174,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1140ef8..e1fcca0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -7,28 +7,28 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v9, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v9, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -36,7 +36,7 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: global_store_b64 v9, v[4:5], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
+; GFX11-NEXT: global_load_b32 v5, v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -128,12 +128,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -227,12 +227,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -327,12 +327,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[1:2], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[1:2], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -355,21 +355,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -413,12 +413,12 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -450,21 +450,21 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,13 +534,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1]
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: global_load_b64 v[4:5], v0, s[2:3]
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 2d81452..35de4a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2559,76 +2559,76 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2738,88 +2738,88 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: s_mulk_i32 s3, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_add_u32 s3, s3, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: s_mulk_i32 s1, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_add_u32 s1, s1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: s_mulk_i32 s4, 0x50
-; GFX9-NEXT: s_add_u32 s3, s4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s2, s1, 31
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: s_mulk_i32 s2, 0x50
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s3, s2, 31
-; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX10-NEXT: s_mulk_i32 s3, 0x50
-; GFX10-NEXT: s_mulk_i32 s2, 0x50
-; GFX10-NEXT: s_add_i32 s3, s4, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX10-NEXT: s_mulk_i32 s1, 0x50
+; GFX10-NEXT: s_mulk_i32 s0, 0x50
+; GFX10-NEXT: s_add_i32 s1, s2, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_ashr_i32 s3, s2, 31
-; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX11-NEXT: s_mulk_i32 s3, 0x50
-; GFX11-NEXT: s_mulk_i32 s2, 0x50
-; GFX11-NEXT: s_add_i32 s3, s4, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX11-NEXT: s_mulk_i32 s1, 0x50
+; GFX11-NEXT: s_mulk_i32 s0, 0x50
+; GFX11-NEXT: s_add_i32 s1, s2, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index c3bd566..5d4f1f65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -145,25 +145,25 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_ashr_i32 s2, s9, 31
-; GFX8-NEXT: s_ashr_i32 s12, s11, 31
-; GFX8-NEXT: s_add_u32 s0, s8, s2
-; GFX8-NEXT: s_addc_u32 s1, s9, s2
-; GFX8-NEXT: s_add_u32 s8, s10, s12
-; GFX8-NEXT: s_mov_b32 s13, s12
-; GFX8-NEXT: s_addc_u32 s9, s11, s12
-; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX8-NEXT: s_ashr_i32 s2, s13, 31
+; GFX8-NEXT: s_ashr_i32 s4, s15, 31
+; GFX8-NEXT: s_add_u32 s0, s12, s2
+; GFX8-NEXT: s_addc_u32 s1, s13, s2
+; GFX8-NEXT: s_add_u32 s6, s14, s4
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_addc_u32 s7, s15, s4
+; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX8-NEXT: s_mov_b32 s3, s2
-; GFX8-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_sub_u32 s14, 0, s8
-; GFX8-NEXT: s_subb_u32 s15, 0, s9
+; GFX8-NEXT: s_sub_u32 s14, 0, s6
+; GFX8-NEXT: s_subb_u32 s15, 0, s7
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
@@ -223,53 +223,53 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s6, v0
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -284,7 +284,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -295,35 +295,35 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s2, s9, 31
-; GFX9-NEXT: s_ashr_i32 s12, s11, 31
-; GFX9-NEXT: s_add_u32 s0, s8, s2
-; GFX9-NEXT: s_addc_u32 s1, s9, s2
-; GFX9-NEXT: s_add_u32 s8, s10, s12
-; GFX9-NEXT: s_mov_b32 s13, s12
-; GFX9-NEXT: s_addc_u32 s9, s11, s12
-; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT: s_ashr_i32 s2, s13, 31
+; GFX9-NEXT: s_ashr_i32 s4, s15, 31
+; GFX9-NEXT: s_add_u32 s0, s12, s2
+; GFX9-NEXT: s_addc_u32 s1, s13, s2
+; GFX9-NEXT: s_add_u32 s6, s14, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_addc_u32 s7, s15, s4
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX9-NEXT: s_mov_b32 s3, s2
-; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_u32 s14, 0, s8
-; GFX9-NEXT: s_subb_u32 s15, 0, s9
+; GFX9-NEXT: s_sub_u32 s14, 0, s6
+; GFX9-NEXT: s_subb_u32 s15, 0, s7
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
@@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s12, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
+; GFX9-NEXT: v_sub_u32_e32 v1, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v6, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s2, s9, 31
-; GFX10-NEXT: s_ashr_i32 s12, s11, 31
-; GFX10-NEXT: s_add_u32 s0, s8, s2
-; GFX10-NEXT: s_addc_u32 s1, s9, s2
-; GFX10-NEXT: s_add_u32 s8, s10, s12
-; GFX10-NEXT: s_mov_b32 s13, s12
-; GFX10-NEXT: s_addc_u32 s9, s11, s12
+; GFX10-NEXT: s_ashr_i32 s2, s13, 31
+; GFX10-NEXT: s_ashr_i32 s4, s15, 31
+; GFX10-NEXT: s_add_u32 s0, s12, s2
+; GFX10-NEXT: s_addc_u32 s1, s13, s2
+; GFX10-NEXT: s_add_u32 s6, s14, s4
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_addc_u32 s7, s15, s4
; GFX10-NEXT: s_mov_b32 s3, s2
-; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT: s_sub_u32 s10, 0, s8
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_sub_u32 s12, 0, s6
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -484,11 +484,12 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s11, 0, s9
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s13, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
+; GFX10-NEXT: s_subb_u32 s13, 0, s7
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
+; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
@@ -510,28 +511,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2]
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1
; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1
; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v6, s12, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0
@@ -540,71 +541,70 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1
; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v3
; GFX10-NEXT: v_mul_hi_u32 v3, s0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT: v_add_co_u32 v0, s10, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v5, s10, v0, v2
+; GFX10-NEXT: v_add_co_u32 v0, s12, v0, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v5, s12, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s12, s6, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1
; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s8
+; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo
-; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2
-; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3
+; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX10-NEXT: v_xor_b32_e32 v5, s2, v0
; GFX10-NEXT: v_xor_b32_e32 v6, s2, v1
-; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s4
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = sdiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 63a0d8a..51c213e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -112,12 +112,12 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX8-NEXT: s_sub_u32 s2, 0, s10
-; GFX8-NEXT: s_subb_u32 s3, 0, s11
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_subb_u32 s3, 0, s15
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -180,53 +180,53 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v4, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v0
; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v2
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s14, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
@@ -241,22 +241,22 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX9-NEXT: s_sub_u32 s2, 0, s10
-; GFX9-NEXT: s_subb_u32 s3, 0, s11
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_subb_u32 s3, 0, s15
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s15
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0
; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
-; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
+; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX10-NEXT: s_sub_u32 s0, 0, s10
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX10-NEXT: s_sub_u32 s0, 0, s14
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s1, 0, s11
+; GFX10-NEXT: s_subb_u32 s1, 0, s15
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
@@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0
+; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v1
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
+; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
@@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0
-; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
@@ -979,13 +979,13 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX8-NEXT: s_sub_u32 s2, 0, s12
-; GFX8-NEXT: s_subb_u32 s3, 0, s13
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX8-NEXT: s_sub_u32 s2, 0, s16
+; GFX8-NEXT: s_subb_u32 s3, 0, s17
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1025,12 +1025,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s18
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: s_subb_u32 s3, 0, s15
+; GFX8-NEXT: s_subb_u32 s3, 0, s19
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
@@ -1050,46 +1050,46 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
-; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0
+; GFX8-NEXT: v_mul_hi_u32 v4, s13, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v6, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v4, s17
; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v8
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s19
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s18
; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
+; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s16, v8
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
@@ -1101,13 +1101,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v10
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc
; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1
; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s16, v10
; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -1175,55 +1175,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v3
+; GFX8-NEXT: v_mul_lo_u32 v8, s14, v4
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3
-; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3
+; GFX8-NEXT: v_mul_hi_u32 v0, s14, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s15, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, s14, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX8-NEXT: v_mul_hi_u32 v8, s15, v4
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s18, v9, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s18, v10, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s15
+; GFX8-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s19, v9, v[7:8]
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s14, v3
; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s15, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s18, v8
; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9
; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v12
; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v12
+; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s18, v7
; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
@@ -1234,30 +1234,30 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s9
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v9, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s8
; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX9-NEXT: s_sub_u32 s2, 0, s12
-; GFX9-NEXT: s_subb_u32 s3, 0, s13
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX9-NEXT: s_sub_u32 s2, 0, s16
+; GFX9-NEXT: s_subb_u32 s3, 0, s17
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s18
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: s_subb_u32 s3, 0, s15
+; GFX9-NEXT: s_subb_u32 s3, 0, s19
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1317,48 +1317,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, s17
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5
; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1
; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18
; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
@@ -1370,13 +1369,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
@@ -1385,7 +1384,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11
+; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11
; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
@@ -1441,55 +1440,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5
-; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6
; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5
+; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6
; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6
-; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6
+; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v1, v11, v9
; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13
; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v10, s11
-; GFX9-NEXT: v_mov_b32_e32 v6, s15
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v10, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s19
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5
; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10
+; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12
; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -1504,22 +1503,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7]
+; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9]
+; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14
-; GFX10-NEXT: s_sub_u32 s0, 0, s12
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19
+; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16
+; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18
+; GFX10-NEXT: s_sub_u32 s0, 0, s16
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT: s_subb_u32 s1, 0, s13
+; GFX10-NEXT: s_subb_u32 s1, 0, s17
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1539,13 +1540,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT: s_sub_u32 s2, 0, s14
+; GFX10-NEXT: s_sub_u32 s2, 0, s18
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT: s_subb_u32 s3, 0, s15
+; GFX10-NEXT: s_subb_u32 s3, 0, s19
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5]
; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6]
@@ -1592,7 +1593,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
@@ -1641,21 +1641,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mul_lo_u32 v3, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v5, s8, v4
-; GFX10-NEXT: v_mul_hi_u32 v4, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v11, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v2, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1
-; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT: v_mul_lo_u32 v12, s11, v0
-; GFX10-NEXT: v_mul_hi_u32 v13, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v14, s11, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4
+; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1
+; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1
+; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0
+; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0
; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
@@ -1678,77 +1677,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s12, v8, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s14, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0
; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11
; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s8, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v3
+; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v15, s0, s10, v2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v15
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s11, v0
+; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s12
+; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v5
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v18
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v17
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v18
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s12
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s14
+; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18
; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v6
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s14
+; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo
@@ -1759,8 +1758,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1
; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[4:5]
-; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[6:7]
+; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9]
+; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i64> %x, %y
store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 037210a..a2439e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i8_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_byte v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_byte v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in, align 4
store i8 %ld, ptr addrspace(1) %out, align 4
@@ -45,36 +45,36 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i16_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_short v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i16, ptr addrspace(4) %in, align 4
store i16 %ld, ptr addrspace(1) %out, align 4
@@ -84,39 +84,39 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i8 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i8 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%sext = sext i8 %load to i32
@@ -127,39 +127,39 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%sext = sext i16 %load to i32
@@ -170,39 +170,39 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%zext = zext i8 %load to i32
@@ -213,39 +213,39 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%zext = zext i16 %load to i32
@@ -256,35 +256,35 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
store i8 %load, ptr addrspace(1) %out, align 2
@@ -294,35 +294,35 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i16_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 2
store i16 %load, ptr addrspace(1) %out, align 2
@@ -332,43 +332,43 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_sextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_sextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
@@ -379,43 +379,43 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_zextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_zextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 422e274..cdf03ae 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -22,65 +22,65 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: s_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s2, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s2, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s2, s3
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -110,75 +110,75 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: s_add_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s5, s7
-; GFX8-NEXT: s_add_i32 s1, s4, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_i32 s1, s1, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s5, s7
-; GFX9-NEXT: s_add_i32 s3, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_i32 s1, s1, s3
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s4, s6
-; GFX10-NEXT: s_add_i32 s3, s5, s7
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s2
+; GFX10-NEXT: s_add_i32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s4, s6
-; GFX11-NEXT: s_add_i32 s3, s5, s7
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_add_i32 s1, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s4, s6
-; GFX12-NEXT: s_add_co_i32 s3, s5, s7
+; GFX12-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -342,42 +342,42 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX8-LABEL: s_add_v8i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s15
; GFX8-NEXT: s_add_i32 s6, s6, s14
; GFX8-NEXT: s_add_i32 s5, s5, s13
; GFX8-NEXT: s_add_i32 s4, s4, s12
-; GFX8-NEXT: s_add_i32 s2, s11, s19
-; GFX8-NEXT: s_add_i32 s3, s10, s18
+; GFX8-NEXT: s_add_i32 s0, s11, s19
+; GFX8-NEXT: s_add_i32 s1, s10, s18
; GFX8-NEXT: s_add_i32 s9, s9, s17
; GFX8-NEXT: s_add_i32 s8, s8, s16
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s15
-; GFX9-NEXT: s_add_i32 s3, s6, s14
+; GFX9-NEXT: s_add_i32 s0, s7, s15
+; GFX9-NEXT: s_add_i32 s1, s6, s14
; GFX9-NEXT: s_add_i32 s6, s11, s19
; GFX9-NEXT: s_add_i32 s7, s10, s18
; GFX9-NEXT: s_add_i32 s9, s9, s17
@@ -388,23 +388,24 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v8i32:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s15
-; GFX10-NEXT: s_add_i32 s3, s6, s14
+; GFX10-NEXT: s_add_i32 s0, s7, s15
+; GFX10-NEXT: s_add_i32 s1, s6, s14
; GFX10-NEXT: s_add_i32 s6, s11, s19
; GFX10-NEXT: s_add_i32 s7, s10, s18
; GFX10-NEXT: s_add_i32 s8, s8, s16
@@ -417,20 +418,20 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
-; GFX10-NEXT: v_mov_b32_e32 v6, s3
-; GFX10-NEXT: v_mov_b32_e32 v7, s2
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v6, s1
+; GFX10-NEXT: v_mov_b32_e32 v7, s0
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v8i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s15
-; GFX11-NEXT: s_add_i32 s3, s6, s14
+; GFX11-NEXT: s_add_i32 s0, s7, s15
+; GFX11-NEXT: s_add_i32 s1, s6, s14
; GFX11-NEXT: s_add_i32 s6, s11, s19
; GFX11-NEXT: s_add_i32 s7, s10, s18
; GFX11-NEXT: s_add_i32 s8, s8, s16
@@ -440,11 +441,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -453,10 +454,10 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s15
-; GFX12-NEXT: s_add_co_i32 s3, s6, s14
+; GFX12-NEXT: s_add_co_i32 s0, s7, s15
+; GFX12-NEXT: s_add_co_i32 s1, s6, s14
; GFX12-NEXT: s_add_co_i32 s6, s11, s19
; GFX12-NEXT: s_add_co_i32 s7, s10, s18
; GFX12-NEXT: s_add_co_i32 s8, s8, s16
@@ -466,11 +467,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -534,7 +535,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s39
; GFX8-NEXT: s_add_i32 s6, s6, s38
@@ -548,43 +549,43 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8-NEXT: s_add_i32 s14, s14, s46
; GFX8-NEXT: s_add_i32 s13, s13, s45
; GFX8-NEXT: s_add_i32 s12, s12, s44
-; GFX8-NEXT: s_add_i32 s2, s19, s51
-; GFX8-NEXT: s_add_i32 s3, s18, s50
+; GFX8-NEXT: s_add_i32 s0, s19, s51
+; GFX8-NEXT: s_add_i32 s1, s18, s50
; GFX8-NEXT: s_add_i32 s17, s17, s49
; GFX8-NEXT: s_add_i32 s16, s16, s48
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 48
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -592,11 +593,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s39
-; GFX9-NEXT: s_add_i32 s3, s6, s38
+; GFX9-NEXT: s_add_i32 s0, s7, s39
+; GFX9-NEXT: s_add_i32 s1, s6, s38
; GFX9-NEXT: s_add_i32 s6, s11, s43
; GFX9-NEXT: s_add_i32 s7, s10, s42
; GFX9-NEXT: s_add_i32 s10, s15, s47
@@ -613,38 +614,38 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9-NEXT: v_mov_b32_e32 v3, s14
; GFX9-NEXT: s_add_i32 s9, s9, s41
; GFX9-NEXT: s_add_i32 s8, s8, s40
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
; GFX9-NEXT: s_add_i32 s5, s5, s37
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: s_add_i32 s4, s4, s36
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v16i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s39
-; GFX10-NEXT: s_add_i32 s3, s6, s38
+; GFX10-NEXT: s_add_i32 s0, s7, s39
+; GFX10-NEXT: s_add_i32 s1, s6, s38
; GFX10-NEXT: s_add_i32 s6, s11, s43
; GFX10-NEXT: s_add_i32 s7, s10, s42
; GFX10-NEXT: s_add_i32 s10, s15, s47
@@ -673,12 +674,12 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX10-NEXT: v_mov_b32_e32 v11, s6
; GFX10-NEXT: v_mov_b32_e32 v12, s4
; GFX10-NEXT: v_mov_b32_e32 v13, s5
-; GFX10-NEXT: v_mov_b32_e32 v14, s3
-; GFX10-NEXT: v_mov_b32_e32 v15, s2
-; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
-; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
-; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v14, s1
+; GFX10-NEXT: v_mov_b32_e32 v15, s0
+; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:48
+; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:32
+; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v16i32:
@@ -686,10 +687,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s39
-; GFX11-NEXT: s_add_i32 s3, s6, s38
+; GFX11-NEXT: s_add_i32 s0, s7, s39
+; GFX11-NEXT: s_add_i32 s1, s6, s38
; GFX11-NEXT: s_add_i32 s6, s11, s43
; GFX11-NEXT: s_add_i32 s7, s10, s42
; GFX11-NEXT: s_add_i32 s10, s15, s47
@@ -711,13 +712,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX11-NEXT: v_mov_b32_e32 v14, s3
+; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_mov_b32_e32 v14, s1
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX11-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX11-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -727,10 +728,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s39
-; GFX12-NEXT: s_add_co_i32 s3, s6, s38
+; GFX12-NEXT: s_add_co_i32 s0, s7, s39
+; GFX12-NEXT: s_add_co_i32 s1, s6, s38
; GFX12-NEXT: s_add_co_i32 s6, s11, s43
; GFX12-NEXT: s_add_co_i32 s7, s10, s42
; GFX12-NEXT: s_add_co_i32 s10, s15, s47
@@ -752,13 +753,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -792,11 +793,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: v_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -804,68 +805,68 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -901,66 +902,66 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_add_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -993,11 +994,11 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX8-LABEL: add64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s6, s0
-; GFX8-NEXT: s_addc_u32 s1, s7, s1
+; GFX8-NEXT: s_add_u32 s0, s6, s2
+; GFX8-NEXT: s_addc_u32 s1, s7, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1035,10 +1036,10 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1050,9 +1051,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1090,15 +1091,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
;
; GFX8-LABEL: add64_sgpr_vgpr:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s2, s4
-; GFX8-NEXT: s_addc_u32 s1, s3, s5
+; GFX8-NEXT: s_add_u32 s0, s6, s0
+; GFX8-NEXT: s_addc_u32 s1, s7, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -1138,16 +1139,16 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX11-LABEL: add64_sgpr_vgpr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, s0
+; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1155,15 +1156,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX12-LABEL: add64_sgpr_vgpr:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,123 +1206,123 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: add64_in_branch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX8-NEXT: s_cbranch_scc0 .LBB9_4
; GFX8-NEXT: ; %bb.1: ; %else
-; GFX8-NEXT: s_add_u32 s4, s4, s6
-; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX8-NEXT: s_add_u32 s0, s8, s10
+; GFX8-NEXT: s_addc_u32 s1, s9, s11
+; GFX8-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX8-NEXT: s_cbranch_vccnz .LBB9_3
; GFX8-NEXT: .LBB9_2: ; %if
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: .LBB9_3: ; %endif
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX8-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX8-NEXT: s_branch .LBB9_2
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_add_u32 s4, s4, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s7
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
; GFX9-NEXT: .LBB9_2: ; %if
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_add_u32 s4, s4, s6
-; GFX10-NEXT: s_addc_u32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
; GFX10-NEXT: s_cbranch_execnz .LBB9_3
; GFX10-NEXT: .LBB9_2: ; %if
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB9_4:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
; GFX11-NEXT: s_cbranch_execnz .LBB9_3
; GFX11-NEXT: .LBB9_2: ; %if
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: .LBB9_3: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB9_4:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB9_2
;
; GFX12-LABEL: add64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB9_3
; GFX12-NEXT: .LBB9_2: ; %if
-; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: .LBB9_3: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB9_4:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB9_2
entry:
%0 = icmp eq i64 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index be9b5b0..65b8db9 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -10,14 +10,14 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -66,13 +66,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
@@ -94,19 +94,19 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: s_add_i32 s2, s2, s0
-; VI-NEXT: s_add_i32 s1, s1, s3
-; VI-NEXT: s_and_b32 s0, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_lshr_b32 s3, s1, 16
+; VI-NEXT: s_add_i32 s0, s0, s1
+; VI-NEXT: s_add_i32 s2, s2, s3
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_lshl_b32 s1, s2, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -144,13 +144,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -165,54 +165,54 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; VI-LABEL: s_test_add_self_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_and_b32 s1, s2, 0xffff
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,17 +279,17 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -298,38 +298,38 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -346,17 +346,17 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -365,38 +365,38 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -412,17 +412,17 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, -1, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -431,37 +431,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -477,16 +477,16 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u16_e32 v2, 32, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v3
@@ -495,37 +495,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -542,17 +542,17 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -560,37 +560,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,14 +608,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -667,13 +667,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -700,14 +700,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -763,12 +763,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
@@ -796,14 +796,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -857,13 +857,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -890,14 +890,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -957,13 +957,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index 330cf48..46379da 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
@@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX10-NEXT: s_cmp_lt_i32 s2, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
@@ -80,10 +80,10 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 77976e4..95f5947 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
+; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
+; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 559871d..e45acee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -475,12 +475,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -488,7 +488,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_short v3, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -544,13 +545,13 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s3, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_lshr_b32 s5, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -559,10 +560,10 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -709,29 +710,28 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s4, 16
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
-; GFX9-NEXT: s_sext_i32_i16 s2, s4
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
+; GFX9-NEXT: s_sext_i32_i16 s0, s4
+; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT: s_xor_b32 s0, s0, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s6, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -781,20 +781,20 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: udiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -849,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: urem_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
+; GFX9-NEXT: s_lshr_b32 s0, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
@@ -863,10 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -1277,12 +1276,12 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: udiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1290,40 +1289,40 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s3, s2, s8
-; GFX9-NEXT: s_sub_i32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s13, s2, 1
-; GFX9-NEXT: s_sub_i32 s4, s3, s8
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s2, s13, s2
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s4, s2, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s8
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s13, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s0, s13, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s4, s3, s9
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s4, s1, s9
; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s8, s3, 1
+; GFX9-NEXT: s_add_i32 s8, s1, 1
; GFX9-NEXT: s_sub_i32 s5, s4, s9
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_cselect_b32 s1, s8, s1
; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s5, s3, 1
+; GFX9-NEXT: s_add_i32 s5, s1, 1
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX9-NEXT: s_sub_i32 s4, 0, s10
@@ -1360,11 +1359,11 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_add_i32 s7, s5, 1
; GFX9-NEXT: s_cmp_ge_u32 s6, s11
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1585,12 +1584,12 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: urem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1600,35 +1599,35 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s2, s2, s8
-; GFX9-NEXT: s_sub_i32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s8
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s3, s3, s9
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s1, s1, s9
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
; GFX9-NEXT: s_sub_i32 s4, 0, s10
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -1660,11 +1659,11 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_sub_i32 s6, s5, s11
; GFX9-NEXT: s_cmp_ge_u32 s5, s11
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1966,7 +1965,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -1998,85 +1996,87 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_xor_b32 s8, s5, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: s_ashr_i32 s8, s8, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s9, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s12, s3, 1
-; GFX9-NEXT: s_sub_i32 s9, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s12, s3
-; GFX9-NEXT: s_cselect_b32 s5, s9, s5
-; GFX9-NEXT: s_add_i32 s9, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s9, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s3, s2, s4
+; GFX9-NEXT: s_sub_i32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s5, s3, s4
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s5, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_xor_b32 s4, s6, s10
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s6, s10
-; GFX9-NEXT: s_abs_i32 s6, s6
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s9, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_add_i32 s10, s8, 1
-; GFX9-NEXT: s_sub_i32 s9, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s8, s10, s8
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_add_i32 s9, s8, 1
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s9, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s6, s2, s3
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s6, s5, s3
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s5, s6, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_xor_b32 s2, s7, s11
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s7, s11
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s7, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s7
; GFX9-NEXT: s_add_i32 s8, s5, 1
-; GFX9-NEXT: s_sub_i32 s7, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_sub_i32 s7, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
+; GFX9-NEXT: s_cselect_b32 s1, s7, s1
; GFX9-NEXT: s_add_i32 s7, s5, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s7, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s7, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = sdiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2350,7 +2350,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2377,78 +2376,80 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT: s_xor_b32 s2, s2, s3
; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_ashr_i32 s8, s5, 31
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s6, 31
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s5, s6, 31
-; GFX9-NEXT: s_abs_i32 s6, s6
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s8, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s8
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s8, s6
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s5, s5, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_ashr_i32 s2, s7, 31
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2604,7 +2605,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -2617,28 +2617,29 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT: s_lshr_b32 s2, s7, 16
+; GFX9-NEXT: s_lshr_b32 s0, s7, 16
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_lshr_b32 s2, s5, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT: s_lshr_b32 s0, s5, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -2654,7 +2655,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -2825,34 +2827,33 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s8, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -2867,24 +2868,25 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, s1, v2
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -3563,27 +3565,27 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
;
; GFX9-LABEL: urem_i3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: s_and_b32 s4, s2, 7
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
+; GFX9-NEXT: s_and_b32 s3, s4, 7
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
+; GFX9-NEXT: s_lshr_b32 s2, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3753,12 +3755,12 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
; GFX9-NEXT: s_cselect_b32 s2, s6, 0
; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3881,7 +3883,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -3894,19 +3895,20 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -3918,8 +3920,9 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v6, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v6, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v6, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4053,32 +4056,32 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
+; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
-; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4087,19 +4090,18 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4465,58 +4467,58 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s8, s6
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
; GFX9-NEXT: s_sext_i32_i16 s9, s4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
-; GFX9-NEXT: s_xor_b32 s2, s9, s8
+; GFX9-NEXT: s_xor_b32 s0, s9, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s10, s2, 1
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s10, s0, 1
; GFX9-NEXT: s_sext_i32_i16 s7, s7
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s10, 0
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s10, 0
; GFX9-NEXT: s_ashr_i32 s6, s6, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s4, s4, 16
; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: v_add_u32_e32 v1, s2, v2
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT: s_xor_b32 s2, s4, s6
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s4, s6
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
-; GFX9-NEXT: s_or_b32 s8, s2, 1
+; GFX9-NEXT: s_or_b32 s8, s0, 1
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s8, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s8, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT: s_xor_b32 s2, s5, s7
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s5, s7
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v2, s2, v4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b32 s6, s0, 1
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v2, s0, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7
; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 0
@@ -4524,9 +4526,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4854,28 +4855,28 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-LABEL: urem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30
-; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s7, s2, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
-; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f
+; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30
+; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30
+; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f
+; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -4892,11 +4893,11 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8
-; GFX9-NEXT: s_lshr_b32 s1, s0, 15
+; GFX9-NEXT: s_lshr_b32 s0, s2, 15
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3
; GFX9-NEXT: s_lshr_b32 s0, s6, 15
; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4
@@ -5717,54 +5718,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s7, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
-; GFX9-NEXT: s_add_i32 s9, s6, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s6
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s9, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s7, s6
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s6, s4, s2
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s7, s4, 1
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s6, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s6, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s4, s1, s7
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s6, s1, 1
+; GFX9-NEXT: s_sub_i32 s5, s4, s7
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s5, s1, 1
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = udiv <2 x i32> %x, %shl.y
@@ -6051,50 +6052,50 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s4, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s6
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s6, s4
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s4, s4, s2
-; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s1, s1, s7
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = urem <2 x i32> %x, %shl.y
@@ -6546,65 +6547,66 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s3, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7
; GFX9-NEXT: s_abs_i32 s7, s4
-; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, 0, s3
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s4, s2, 31
+; GFX9-NEXT: s_sub_i32 s2, 0, s3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8
-; GFX9-NEXT: s_mul_i32 s8, s4, s3
+; GFX9-NEXT: s_mul_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s2
+; GFX9-NEXT: s_mul_hi_u32 s2, s7, s8
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
; GFX9-NEXT: s_sub_i32 s7, s7, s8
-; GFX9-NEXT: s_add_i32 s9, s4, 1
+; GFX9-NEXT: s_add_i32 s9, s2, 1
; GFX9-NEXT: s_sub_i32 s8, s7, s3
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
; GFX9-NEXT: s_cselect_b32 s7, s8, s7
-; GFX9-NEXT: s_add_i32 s8, s4, 1
+; GFX9-NEXT: s_add_i32 s8, s2, 1
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s3, s8, s4
-; GFX9-NEXT: s_abs_i32 s4, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s7, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cselect_b32 s7, s8, s2
+; GFX9-NEXT: s_abs_i32 s8, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s5, s6
+; GFX9-NEXT: s_abs_i32 s1, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s6, s5, s6
-; GFX9-NEXT: s_abs_i32 s5, s5
-; GFX9-NEXT: s_ashr_i32 s6, s6, 31
+; GFX9-NEXT: s_xor_b32 s5, s7, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s8
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s3
-; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7
-; GFX9-NEXT: s_add_i32 s3, s3, s7
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s7, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
-; GFX9-NEXT: s_add_i32 s8, s3, 1
-; GFX9-NEXT: s_sub_i32 s7, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
+; GFX9-NEXT: s_add_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
+; GFX9-NEXT: s_mul_i32 s6, s5, s8
+; GFX9-NEXT: s_sub_i32 s1, s1, s6
+; GFX9-NEXT: s_add_i32 s7, s5, 1
+; GFX9-NEXT: s_sub_i32 s6, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: s_add_i32 s7, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_add_i32 s6, s5, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s1, s6, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = sdiv <2 x i32> %x, %shl.y
@@ -6989,7 +6991,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s2, s2
@@ -7013,35 +7014,37 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cselect_b32 s4, s7, s4
; GFX9-NEXT: s_sub_i32 s7, s4, s2
; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s7, s4
-; GFX9-NEXT: s_abs_i32 s3, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_xor_b32 s2, s2, s6
-; GFX9-NEXT: s_sub_i32 s7, 0, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s6
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_abs_i32 s7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s4, s5, 31
-; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: s_abs_i32 s1, s5
+; GFX9-NEXT: s_sub_i32 s5, 0, s7
+; GFX9-NEXT: s_sub_i32 s4, s4, s6
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7
-; GFX9-NEXT: s_add_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s3, s6, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = srem <2 x i32> %x, %shl.y
@@ -7281,13 +7284,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: udiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = udiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -7614,18 +7617,18 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s8, 12
+; GFX9-NEXT: s_add_i32 s0, s8, 12
; GFX9-NEXT: s_add_i32 s8, s10, 12
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = udiv <2 x i64> %x, %shl.y
@@ -7862,12 +7865,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: urem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = urem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -8003,22 +8006,22 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10
+; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s10
; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8
; GFX9-NEXT: s_add_u32 s8, s8, -1
; GFX9-NEXT: s_addc_u32 s9, s9, -1
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT: s_add_u32 s2, s2, -1
-; GFX9-NEXT: s_addc_u32 s3, s3, -1
-; GFX9-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_add_u32 s0, s0, -1
+; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = urem <2 x i64> %x, %shl.y
@@ -8129,58 +8132,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s9, v0
-; GFX9-NEXT: s_mul_i32 s8, s2, s6
+; GFX9-NEXT: s_mul_i32 s7, s2, s8
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s7, s2, s6
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s8
+; GFX9-NEXT: s_add_u32 s7, s10, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
; GFX9-NEXT: s_mul_hi_u32 s11, s3, s9
; GFX9-NEXT: s_mul_i32 s9, s3, s9
-; GFX9-NEXT: s_add_u32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s6
-; GFX9-NEXT: s_addc_u32 s7, s7, s11
-; GFX9-NEXT: s_addc_u32 s8, s10, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s6, s7, s6
-; GFX9-NEXT: s_addc_u32 s7, 0, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s9
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT: s_addc_u32 s6, s6, s11
+; GFX9-NEXT: s_addc_u32 s7, s10, 0
+; GFX9-NEXT: s_mul_i32 s8, s3, s8
+; GFX9-NEXT: s_add_u32 s6, s6, s8
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_add_u32 s8, s6, 1
; GFX9-NEXT: s_addc_u32 s9, s7, 0
; GFX9-NEXT: s_add_u32 s10, s6, 2
@@ -8213,13 +8216,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b32 s3, s3, s7
; GFX9-NEXT: s_cselect_b32 s2, s8, s6
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s4
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: s_sub_u32 s2, s2, s0
+; GFX9-NEXT: s_subb_u32 s3, s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -8252,17 +8255,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -9518,100 +9521,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s7, s2, s6
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT: s_add_u32 s7, s9, s7
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s7, s7, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s10
-; GFX9-NEXT: s_addc_u32 s7, s9, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s5, s5, s6
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: s_mul_hi_u32 s8, s5, 0x12d8fb
-; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: s_mul_i32 s6, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8
+; GFX9-NEXT: s_add_u32 s6, s9, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX9-NEXT: s_mul_i32 s7, s3, s7
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s3, s8
+; GFX9-NEXT: s_addc_u32 s1, s1, s10
+; GFX9-NEXT: s_addc_u32 s6, s9, 0
+; GFX9-NEXT: s_mul_i32 s7, s3, s8
+; GFX9-NEXT: s_add_u32 s1, s1, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
+; GFX9-NEXT: s_mul_hi_u32 s8, s1, 0x12d8fb
+; GFX9-NEXT: s_mul_i32 s1, s1, 0x12d8fb
; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s2, s3, s8
+; GFX9-NEXT: s_subb_u32 s1, s3, s8
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s3, s2, 0
+; GFX9-NEXT: s_subb_u32 s2, s1, 0
; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s5, s3, 0
+; GFX9-NEXT: s_subb_u32 s3, s2, 0
; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT: s_cmp_eq_u32 s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, s4, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -9646,19 +9649,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s4, s2, s4
-; GFX9-NEXT: s_addc_u32 s5, s3, 0
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT: s_sub_u32 s0, s6, s0
+; GFX9-NEXT: s_subb_u32 s1, s7, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index d613759..c623364 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
-; SI: s_lshl_b32 [[A]], [[A]], 1
-; SI: s_lshl_b32 [[B]], [[B]], 1
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
@@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 897e134..4617a53 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -22,17 +22,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
;
; GFX8-LABEL: anyext_i1_i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: anyext_i1_i32:
@@ -89,15 +89,15 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: s_anyext_i16_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 624101d..220fa5a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -63,13 +63,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -119,13 +119,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -146,13 +147,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -175,14 +177,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -206,14 +208,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -238,14 +240,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -269,14 +271,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -336,14 +338,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -366,14 +368,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -396,13 +398,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -424,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -454,14 +458,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -486,14 +490,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -519,14 +523,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -551,14 +555,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -612,13 +616,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -653,13 +657,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -693,13 +697,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -732,13 +737,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -774,14 +780,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -817,13 +823,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -861,14 +867,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -904,13 +910,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -969,13 +975,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB3_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1012,13 +1018,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB3_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: struct_add_i32_varying_vdata:
@@ -1055,13 +1061,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: .LBB3_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: struct_add_i32_varying_vdata:
@@ -1097,13 +1104,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: .LBB3_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: struct_add_i32_varying_vdata:
@@ -1142,14 +1150,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB3_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1188,13 +1196,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB3_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1235,14 +1243,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB3_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1281,13 +1289,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1315,12 +1323,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1328,51 +1336,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1431,14 +1442,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1460,14 +1471,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1489,14 +1500,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1517,14 +1529,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1547,15 +1560,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1579,15 +1592,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1612,15 +1625,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1644,15 +1657,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1712,14 +1725,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1742,14 +1755,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1772,14 +1785,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1801,14 +1814,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1832,15 +1845,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1865,15 +1878,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1899,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1932,15 +1945,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1994,13 +2007,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2035,13 +2048,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -2075,13 +2088,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -2114,13 +2128,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -2156,14 +2171,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2199,14 +2214,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2244,14 +2259,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2287,14 +2302,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2322,12 +2337,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2335,51 +2350,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8ee0ee3..529af3d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -48,243 +48,243 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i32_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB0_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, s2
; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB0_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB0_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB0_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB0_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -647,280 +647,280 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB2_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB2_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB2_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB2_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB2_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB2_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB2_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -928,97 +928,97 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: add_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB2_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB2_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB2_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB2_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1071,260 +1071,259 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i64_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB3_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v0, s2
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB3_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: v_readfirstlane_b32 s1, v1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s2, v0
-; GFX89-NEXT: v_readfirstlane_b32 s3, v1
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: s_nop 2
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB3_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB3_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB3_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB3_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1383,21 +1382,21 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -1406,14 +1405,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
@@ -1548,9 +1547,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -1561,9 +1560,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -1575,15 +1574,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1595,24 +1594,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -1621,15 +1620,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1641,11 +1640,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1654,7 +1653,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -1665,15 +1664,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB4_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1683,22 +1682,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -1706,14 +1705,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1746,82 +1745,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: add_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1869,283 +1868,283 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB6_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB6_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB6_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB6_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB6_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB6_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2514,280 +2513,280 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB8_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB8_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB8_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB8_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB8_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB8_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB8_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB8_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2795,97 +2794,97 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: sub_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB8_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB8_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB8_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB8_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2938,317 +2937,313 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB9_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB9_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB9_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB9_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -3307,21 +3302,21 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB10_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -3330,10 +3325,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB10_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
+; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
@@ -3481,9 +3476,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -3494,9 +3489,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -3508,17 +3503,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB10_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3530,24 +3525,24 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB10_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -3556,17 +3551,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB10_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3578,11 +3573,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3591,7 +3586,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -3602,17 +3597,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB10_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v1, v4
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3624,22 +3619,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -3648,15 +3643,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: .LBB10_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_mov_b32_e32 v1, v4
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3691,82 +3686,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: sub_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index af6f6913..98a28b2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
@@ -120,13 +120,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
@@ -148,13 +149,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
@@ -178,14 +180,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -209,14 +211,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -281,14 +283,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
@@ -311,14 +313,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
@@ -343,13 +345,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
@@ -373,13 +376,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
@@ -405,14 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -438,14 +442,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -503,13 +507,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
@@ -543,13 +547,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
@@ -584,13 +588,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
@@ -624,13 +629,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
@@ -667,14 +673,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -711,14 +717,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -974,17 +980,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_constant:
@@ -1005,17 +1010,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
@@ -1038,14 +1042,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
@@ -1067,14 +1072,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
@@ -1098,15 +1104,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1131,15 +1137,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1196,228 +1202,229 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: v_mul_lo_u32 v3, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
-; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
-; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB5_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB5_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1447,51 +1454,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1554,14 +1561,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
@@ -1582,14 +1589,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
@@ -1612,14 +1619,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
@@ -1641,14 +1649,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
@@ -1672,15 +1681,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1704,15 +1713,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1777,14 +1786,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
@@ -1807,14 +1816,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
@@ -1839,14 +1848,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
@@ -1870,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_uniform:
@@ -1903,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1937,15 +1946,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2003,13 +2012,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
@@ -2043,13 +2052,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
@@ -2084,13 +2093,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB9_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
@@ -2124,13 +2134,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB9_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
@@ -2167,14 +2178,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2211,14 +2222,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2474,18 +2485,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
@@ -2506,18 +2517,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
@@ -2540,17 +2551,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
@@ -2572,17 +2584,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
@@ -2606,18 +2619,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB11_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2642,18 +2655,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB11_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2710,241 +2723,241 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB12_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB12_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
-; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
+; GFX8-NEXT: v_mul_lo_u32 v4, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB12_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v3
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB12_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB12_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2974,51 +2987,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3078,13 +3091,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
@@ -3118,13 +3131,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_and_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: and_i32_varying:
@@ -3159,13 +3172,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB14_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: and_i32_varying:
@@ -3199,13 +3213,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB14_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: and_i32_varying:
@@ -3242,14 +3257,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3286,14 +3301,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3352,13 +3367,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: or_i32_varying:
@@ -3392,13 +3407,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: or_i32_varying:
@@ -3433,13 +3448,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB15_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: or_i32_varying:
@@ -3473,13 +3489,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB15_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: or_i32_varying:
@@ -3516,14 +3533,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB15_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3560,14 +3577,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB15_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3626,13 +3643,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
@@ -3666,13 +3683,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
@@ -3707,13 +3724,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB16_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
@@ -3747,13 +3765,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB16_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: xor_i32_varying:
@@ -3790,14 +3809,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB16_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3834,14 +3853,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB16_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3900,13 +3919,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
@@ -3940,13 +3959,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
@@ -3981,13 +4000,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB17_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
@@ -4021,13 +4041,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB17_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
@@ -4064,14 +4085,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB17_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4108,14 +4129,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB17_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4180,21 +4201,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
@@ -4213,21 +4234,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
@@ -4248,18 +4269,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i64_constant:
@@ -4279,18 +4301,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i64_constant:
@@ -4311,19 +4334,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB18_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4344,19 +4367,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB18_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4414,13 +4437,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i32_varying:
@@ -4454,13 +4477,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i32_varying:
@@ -4495,13 +4518,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB19_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i32_varying:
@@ -4535,13 +4559,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB19_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i32_varying:
@@ -4578,14 +4603,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB19_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4622,14 +4647,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB19_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4694,21 +4719,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i64_constant:
@@ -4727,21 +4752,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
@@ -4762,18 +4787,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
@@ -4793,18 +4819,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
@@ -4825,19 +4852,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB20_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4858,19 +4885,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB20_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4928,13 +4955,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
@@ -4968,13 +4995,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
@@ -5009,13 +5036,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB21_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
@@ -5049,13 +5077,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB21_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i32_varying:
@@ -5092,14 +5121,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB21_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5136,14 +5165,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB21_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5207,20 +5236,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
@@ -5239,20 +5268,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
@@ -5273,18 +5302,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
@@ -5304,18 +5334,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i64_constant:
@@ -5336,19 +5367,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5369,19 +5400,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5439,13 +5470,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
@@ -5479,13 +5510,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
@@ -5520,13 +5551,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB23_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
@@ -5560,13 +5592,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB23_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i32_varying:
@@ -5603,14 +5636,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB23_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5647,14 +5680,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB23_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5718,20 +5751,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
@@ -5750,20 +5783,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
@@ -5784,18 +5817,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
@@ -5815,18 +5849,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
@@ -5847,19 +5882,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5880,19 +5915,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index ca94d68..aa5c480 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -62,13 +62,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -90,13 +90,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -118,13 +118,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -145,13 +146,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -174,14 +176,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -205,14 +207,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -237,14 +239,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -268,14 +270,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -335,14 +337,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -365,14 +367,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -395,13 +397,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -423,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -453,14 +457,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -485,14 +489,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -518,14 +522,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -550,14 +554,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -611,13 +615,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -652,13 +656,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -692,13 +696,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -731,13 +736,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -773,14 +779,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -816,13 +822,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -860,14 +866,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -903,13 +909,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -937,12 +943,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -950,51 +956,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,14 +1062,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1082,14 +1091,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1111,14 +1120,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB4_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1139,14 +1149,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB4_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1169,15 +1180,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB4_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1201,15 +1212,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB4_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1234,15 +1245,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB4_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1266,15 +1277,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1334,14 +1345,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1364,14 +1375,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1394,14 +1405,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1423,14 +1434,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1454,15 +1465,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1487,15 +1498,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1521,15 +1532,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1554,15 +1565,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1616,13 +1627,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1657,13 +1668,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1697,13 +1708,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB6_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1736,13 +1748,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB6_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1778,14 +1791,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1821,14 +1834,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB6_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1866,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1909,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1944,12 +1957,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1957,51 +1970,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 7e15c07..783c5d4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -122,13 +122,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -150,13 +151,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -180,14 +182,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -212,14 +214,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -245,14 +247,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -276,14 +278,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -345,14 +347,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -376,14 +378,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -407,13 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -436,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -467,14 +471,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -500,14 +504,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -534,14 +538,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -566,14 +570,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -629,13 +633,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -671,13 +675,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -712,13 +716,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -752,13 +757,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -795,14 +801,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -839,13 +845,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -884,14 +890,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -928,13 +934,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -962,12 +968,12 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -975,51 +981,54 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,13 +1062,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1070,13 +1078,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
@@ -1085,13 +1093,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_offset:
@@ -1100,13 +1108,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1118,41 +1126,43 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: add_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: add_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1213,14 +1223,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1243,14 +1253,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1273,14 +1283,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1302,14 +1313,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1333,15 +1345,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1366,15 +1378,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1400,15 +1412,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1432,15 +1444,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1502,14 +1514,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1533,14 +1545,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1564,14 +1576,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1594,14 +1606,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1626,15 +1638,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1660,15 +1672,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1695,15 +1707,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1728,15 +1740,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1792,13 +1804,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1834,13 +1846,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1875,13 +1887,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1915,13 +1928,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1958,14 +1972,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2002,14 +2016,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2048,14 +2062,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2092,14 +2106,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2127,12 +2141,12 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2140,51 +2154,54 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2218,13 +2235,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2235,13 +2251,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
@@ -2250,13 +2266,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_offset:
@@ -2265,13 +2281,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2283,41 +2299,43 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: sub_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index ad6009e..d74623a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -59,12 +59,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
@@ -73,12 +73,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
@@ -140,14 +140,14 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -156,12 +156,12 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -175,21 +175,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -201,21 +201,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -227,22 +227,22 @@ entry:
define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s4
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
index 0f20ed1..1b277c0 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine8:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
@@ -71,40 +71,40 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 16, 16
; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine16:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0
; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v2, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v2, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 1639ec6..15cd6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -23,18 +23,18 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_u32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -78,18 +78,18 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -221,18 +221,18 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_i32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -276,18 +276,18 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -418,14 +418,14 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -463,16 +463,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 19
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 19
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -510,16 +510,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 7b8eacc..31b5b16 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1426,11 +1426,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1442,11 +1442,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1457,11 +1457,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1473,11 +1473,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1514,11 +1514,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1530,11 +1530,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1545,11 +1545,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1561,11 +1561,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1602,11 +1602,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1618,11 +1618,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1649,11 +1649,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-LABEL: s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1724,12 +1724,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1741,12 +1741,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 8b2f66b..935909e 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #
;
; VI-LABEL: s_bfm_pattern_simple:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfm_b32 s2, s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfm_b32 s0, s4, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%a = shl i32 1, %x
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 49ec09d..6c4791d 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -34,41 +34,41 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
;
; FLAT-LABEL: s_brev_i16:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: s_lshr_b32 s4, s4, 16
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: s_lshr_b32 s0, s0, 16
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: s_lshr_b32 s0, s0, 16
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i16:
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
-; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FLAT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s4
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3]
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -76,17 +76,17 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
; GFX11-GISEL-LABEL: s_brev_i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
-; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s0
+; GFX11-GISEL-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -199,25 +199,25 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
;
; FLAT-LABEL: s_brev_i32:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_brev_b32 s0, s4
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
;
@@ -225,14 +225,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s2
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -240,14 +240,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-GISEL-LABEL: s_brev_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s4
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -702,17 +702,17 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; FLAT-LABEL: s_brev_v2i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s11, 0xf000
+; FLAT-NEXT: s_mov_b32 s10, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7]
-; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: v_mov_b32_e32 v1, s5
-; FLAT-NEXT: v_mov_b32_e32 v2, s6
-; FLAT-NEXT: v_mov_b32_e32 v3, s7
-; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; FLAT-NEXT: s_brev_b64 s[0:1], s[6:7]
+; FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
+; FLAT-NEXT: v_mov_b32_e32 v0, s2
+; FLAT-NEXT: v_mov_b32_e32 v1, s3
+; FLAT-NEXT: v_mov_b32_e32 v2, s0
+; FLAT-NEXT: v_mov_b32_e32 v3, s1
+; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
@@ -735,15 +735,15 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s10, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
-; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7]
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[4:5]
+; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[6:7]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 3dbbb87..8bee436 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -137,42 +137,42 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
;
; VI-LABEL: br_cc_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB1_2
; VI-NEXT: ; %bb.1: ; %one
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
; VI-NEXT: .LBB1_2: ; %two
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %one
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB1_2: ; %two
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -221,44 +221,44 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
;
; VI-LABEL: br_cc_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB2_2
; VI-NEXT: ; %bb.1: ; %one
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB2_2: ; %two
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %two
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB2_2: ; %one
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 384715a..b8d9878 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index e4c7df3..134e76c 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -34,29 +34,29 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, 0, s2, 0x10203
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -87,31 +87,31 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s3, v0
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s7, v0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -148,35 +148,35 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s11, v0
; VI-NEXT: v_perm_b32 v2, 0, s10, v0
; VI-NEXT: v_perm_b32 v1, 0, s9, v0
; VI-NEXT: v_perm_b32 v0, 0, s8, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v4i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -296,31 +296,31 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s2, v0
-; VI-NEXT: v_perm_b32 v0, 0, s3, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s6, v0
+; VI-NEXT: v_perm_b32 v0, 0, s7, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,35 +357,35 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s10, v0
; VI-NEXT: v_perm_b32 v2, 0, s11, v0
; VI-NEXT: v_perm_b32 v1, 0, s8, v0
; VI-NEXT: v_perm_b32 v0, 0, s9, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 8d347ae..04ee81b 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -19,12 +19,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector2:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -52,12 +52,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector2:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -80,14 +80,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: v_mov_b32_e32 v2, 7
; GFX8-NEXT: v_mov_b32_e32 v3, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -119,14 +119,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector4:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: v_mov_b32_e32 v2, 7
; GFX940-NEXT: v_mov_b32_e32 v3, 8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -146,11 +146,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector_v2i16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -176,11 +176,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector_v2i16:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
@@ -201,14 +201,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
;
; GFX8-LABEL: build_vector_v2i16_trunc:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s2, s2, 0x50000
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX8-NEXT: s_or_b32 s0, s0, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 00af922..d5a9607 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1357,29 +1357,29 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v8i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s1, 24
-; VI-NEXT: s_lshr_b32 s3, s1, 16
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s0, s3, 24
+; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_lshr_b32 s4, s0, 24
-; VI-NEXT: s_lshr_b32 s5, s0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_lshr_b32 s4, s2, 24
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s2, s2, s2
; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
@@ -1392,20 +1392,20 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v8i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: v_add_nc_u16 v2, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v3, s0, s0
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s2
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: s_lshr_b32 s1, s2, 24
+; GFX11-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-NEXT: s_lshr_b32 s5, s3, 24
+; GFX11-NEXT: v_add_nc_u16 v2, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v3, s2, s2
; GFX11-NEXT: v_add_nc_u16 v4, s5, s5
; GFX11-NEXT: v_add_nc_u16 v5, s4, s4
-; GFX11-NEXT: v_add_nc_u16 v6, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v7, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -1524,58 +1524,58 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v16i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s3, 24
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_lshr_b32 s6, s2, 24
-; VI-NEXT: s_lshr_b32 s7, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: s_add_i32 s7, s7, s7
-; VI-NEXT: s_add_i32 s6, s6, s6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: s_lshr_b32 s8, s1, 24
-; VI-NEXT: s_lshr_b32 s9, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshr_b32 s8, s5, 24
+; VI-NEXT: s_lshr_b32 s9, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s10, s0, 24
-; VI-NEXT: s_lshr_b32 s11, s0, 16
+; VI-NEXT: s_lshr_b32 s10, s4, 24
+; VI-NEXT: s_lshr_b32 s11, s4, 16
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s10
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
@@ -1585,36 +1585,36 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v16i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s10, s3, 16
-; GFX11-NEXT: s_lshr_b32 s11, s3, 24
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX11-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-NEXT: s_lshr_b32 s11, s7, 24
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s7
; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_add_nc_u16 v8, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v4, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v5, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v4, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v5, s6, s6
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT: s_lshr_b32 s7, s1, 24
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_add_nc_u16 v11, s7, s7
+; GFX11-NEXT: s_lshr_b32 s3, s5, 24
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
+; GFX11-NEXT: v_add_nc_u16 v11, s3, s3
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX11-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s9, s2, 24
-; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v12, s6, s6
+; GFX11-NEXT: s_lshr_b32 s2, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: s_lshr_b32 s1, s4, 24
+; GFX11-NEXT: s_lshr_b32 s8, s6, 16
+; GFX11-NEXT: s_lshr_b32 s9, s6, 24
+; GFX11-NEXT: v_add_nc_u16 v6, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v12, s2, s2
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v9, s9, s9
; GFX11-NEXT: v_add_nc_u16 v10, s8, s8
@@ -1622,10 +1622,10 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-NEXT: v_lshlrev_b16 v4, 8, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v7, s4, s4
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v8, s5, s5
-; GFX11-NEXT: v_add_nc_u16 v11, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v8, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v11, s0, s0
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
@@ -1816,112 +1816,112 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v32i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v10, 0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s8, s3, 24
-; VI-NEXT: s_lshr_b32 s9, s3, 16
-; VI-NEXT: s_add_i32 s9, s9, s9
-; VI-NEXT: s_add_i32 s8, s8, s8
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s8
-; VI-NEXT: v_mov_b32_e32 v9, s9
-; VI-NEXT: s_lshr_b32 s10, s2, 24
-; VI-NEXT: s_lshr_b32 s11, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s0
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: s_add_i32 s11, s11, s11
-; VI-NEXT: s_add_i32 s10, s10, s10
-; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s10
-; VI-NEXT: v_mov_b32_e32 v9, s11
-; VI-NEXT: s_lshr_b32 s12, s1, 24
-; VI-NEXT: s_lshr_b32 s13, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s2
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: s_lshr_b32 s12, s5, 24
+; VI-NEXT: s_lshr_b32 s13, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_i32 s13, s13, s13
; VI-NEXT: s_add_i32 s12, s12, s12
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v4, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s12
; VI-NEXT: v_mov_b32_e32 v9, s13
-; VI-NEXT: s_lshr_b32 s14, s0, 24
-; VI-NEXT: s_lshr_b32 s15, s0, 16
+; VI-NEXT: s_lshr_b32 s14, s4, 24
+; VI-NEXT: s_lshr_b32 s15, s4, 16
; VI-NEXT: v_add_u32_sdwa v5, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_add_u32_sdwa v6, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_add_i32 s15, s15, s15
; VI-NEXT: s_add_i32 s14, s14, s14
; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v7, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s14
; VI-NEXT: v_mov_b32_e32 v9, s15
-; VI-NEXT: s_lshr_b32 s16, s7, 24
-; VI-NEXT: s_lshr_b32 s17, s7, 16
+; VI-NEXT: s_lshr_b32 s16, s11, 24
+; VI-NEXT: s_lshr_b32 s17, s11, 16
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s0
+; VI-NEXT: v_mov_b32_e32 v9, s4
; VI-NEXT: s_add_i32 s17, s17, s17
; VI-NEXT: s_add_i32 s16, s16, s16
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s16
; VI-NEXT: v_mov_b32_e32 v9, s17
-; VI-NEXT: s_lshr_b32 s18, s6, 24
-; VI-NEXT: s_lshr_b32 s19, s6, 16
+; VI-NEXT: s_lshr_b32 s18, s10, 24
+; VI-NEXT: s_lshr_b32 s19, s10, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: v_mov_b32_e32 v9, s11
; VI-NEXT: s_add_i32 s19, s19, s19
; VI-NEXT: s_add_i32 s18, s18, s18
; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s18
; VI-NEXT: v_mov_b32_e32 v9, s19
-; VI-NEXT: s_lshr_b32 s20, s5, 24
-; VI-NEXT: s_lshr_b32 s21, s5, 16
+; VI-NEXT: s_lshr_b32 s20, s9, 24
+; VI-NEXT: s_lshr_b32 s21, s9, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v9, s10
; VI-NEXT: s_add_i32 s21, s21, s21
; VI-NEXT: s_add_i32 s20, s20, s20
; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s5, s5, s5
+; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
-; VI-NEXT: s_lshr_b32 s22, s4, 24
-; VI-NEXT: s_lshr_b32 s23, s4, 16
+; VI-NEXT: s_lshr_b32 s22, s8, 24
+; VI-NEXT: s_lshr_b32 s23, s8, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: v_mov_b32_e32 v9, s9
; VI-NEXT: s_add_i32 s23, s23, s23
; VI-NEXT: s_add_i32 s22, s22, s22
; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s4, s4, s4
+; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s22
; VI-NEXT: v_mov_b32_e32 v9, s23
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s4
+; VI-NEXT: v_mov_b32_e32 v9, s8
; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v8, 16
@@ -1932,39 +1932,39 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v32i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX11-NEXT: s_lshr_b32 s21, s3, 16
-; GFX11-NEXT: s_lshr_b32 s22, s3, 24
-; GFX11-NEXT: v_add_nc_u16 v8, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v7, 8, s7
+; GFX11-NEXT: s_lshr_b32 s21, s7, 16
+; GFX11-NEXT: s_lshr_b32 s22, s7, 24
+; GFX11-NEXT: v_add_nc_u16 v8, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
; GFX11-NEXT: v_add_nc_u16 v7, v7, v7
; GFX11-NEXT: v_add_nc_u16 v10, s22, s22
; GFX11-NEXT: v_add_nc_u16 v11, s21, s21
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s1
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s5
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT: s_lshr_b32 s18, s1, 16
-; GFX11-NEXT: s_lshr_b32 s19, s1, 24
-; GFX11-NEXT: s_lshr_b32 s20, s2, 24
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_lshr_b32 s18, s5, 16
+; GFX11-NEXT: s_lshr_b32 s19, s5, 24
+; GFX11-NEXT: s_lshr_b32 s20, s6, 24
+; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
; GFX11-NEXT: v_add_nc_u16 v8, s20, s20
; GFX11-NEXT: v_or_b32_e32 v10, v11, v10
; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
-; GFX11-NEXT: v_add_nc_u16 v11, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_add_nc_u16 v12, s19, s19
; GFX11-NEXT: v_add_nc_u16 v13, s18, s18
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s4
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
@@ -1974,10 +1974,10 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v3
; GFX11-NEXT: v_or_b32_e32 v3, v9, v8
; GFX11-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-NEXT: v_add_nc_u16 v9, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v9, s4, s4
; GFX11-NEXT: v_or_b32_e32 v8, v13, v12
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT: v_lshrrev_b16 v6, 8, s7
+; GFX11-NEXT: v_lshrrev_b16 v6, 8, s11
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3
@@ -1985,14 +1985,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v13, 8, v1
-; GFX11-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX11-NEXT: s_lshr_b32 s14, s7, 16
-; GFX11-NEXT: s_lshr_b32 s15, s7, 24
-; GFX11-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-NEXT: s_lshr_b32 s17, s0, 24
+; GFX11-NEXT: v_lshrrev_b16 v5, 8, s10
+; GFX11-NEXT: s_lshr_b32 s14, s11, 16
+; GFX11-NEXT: s_lshr_b32 s15, s11, 24
+; GFX11-NEXT: s_lshr_b32 s16, s4, 16
+; GFX11-NEXT: s_lshr_b32 s17, s4, 24
; GFX11-NEXT: v_or_b32_e32 v3, v7, v10
; GFX11-NEXT: v_or_b32_e32 v2, v14, v11
-; GFX11-NEXT: v_add_nc_u16 v7, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_or_b32_e32 v1, v12, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v13
; GFX11-NEXT: v_add_nc_u16 v9, s17, s17
@@ -2000,7 +2000,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v6, v6, v6
; GFX11-NEXT: v_add_nc_u16 v11, s15, s15
; GFX11-NEXT: v_add_nc_u16 v12, s14, s14
-; GFX11-NEXT: v_add_nc_u16 v13, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v13, s10, s10
; GFX11-NEXT: v_add_nc_u16 v5, v5, v5
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v6, 8, v6
@@ -2008,16 +2008,16 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX11-NEXT: v_lshrrev_b16 v4, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s8
+; GFX11-NEXT: v_lshrrev_b16 v4, 8, s9
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT: s_lshr_b32 s12, s6, 16
-; GFX11-NEXT: s_lshr_b32 s13, s6, 24
-; GFX11-NEXT: s_lshr_b32 s8, s4, 16
-; GFX11-NEXT: s_lshr_b32 s9, s4, 24
-; GFX11-NEXT: s_lshr_b32 s10, s5, 16
-; GFX11-NEXT: s_lshr_b32 s11, s5, 24
+; GFX11-NEXT: s_lshr_b32 s12, s10, 16
+; GFX11-NEXT: s_lshr_b32 s13, s10, 24
+; GFX11-NEXT: s_lshr_b32 s0, s8, 16
+; GFX11-NEXT: s_lshr_b32 s1, s8, 24
+; GFX11-NEXT: s_lshr_b32 s2, s9, 16
+; GFX11-NEXT: s_lshr_b32 s3, s9, 24
; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-NEXT: v_or_b32_e32 v7, v12, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v8
@@ -2025,14 +2025,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v9, s13, s13
; GFX11-NEXT: v_add_nc_u16 v10, s12, s12
; GFX11-NEXT: v_or_b32_e32 v5, v13, v5
-; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v11, s9, s9
; GFX11-NEXT: v_add_nc_u16 v4, v4, v4
-; GFX11-NEXT: v_add_nc_u16 v13, s11, s11
-; GFX11-NEXT: v_add_nc_u16 v14, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v15, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v13, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v14, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v15, s8, s8
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v16, s9, s9
-; GFX11-NEXT: v_add_nc_u16 v17, s8, s8
+; GFX11-NEXT: v_add_nc_u16 v16, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v17, s0, s0
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index a0499ef..8ad4535 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -35,11 +35,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: sadd64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_u32 s0, s6, s0
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_add_u32 s0, s6, s2
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -77,11 +77,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -91,11 +91,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -105,10 +105,10 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -144,74 +144,74 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: sadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 0x56789876
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0x1234
+; VI-NEXT: s_add_u32 s0, s6, 0x56789876
+; VI-NEXT: s_addc_u32 s1, s7, 0x1234
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: sadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: sadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: sadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W32-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: sadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W64-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: sadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX11-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -243,66 +243,66 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vadd64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -334,65 +334,66 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0x1234, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -424,12 +425,12 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: suaddo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -460,36 +461,36 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,12 +535,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: uaddo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -576,42 +577,42 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: uaddo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_add_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: uaddo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_add_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: uaddo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_add_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -655,19 +656,19 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: suaddo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -675,80 +676,80 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s6, s4, s6
-; GFX1010-NEXT: s_addc_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_add_u32 s0, s8, s10
+; GFX1010-NEXT: s_addc_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: suaddo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s6, s4, s6
-; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -792,13 +793,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vuaddo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -840,48 +841,48 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vuaddo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vuaddo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vuaddo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -920,11 +921,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: ssub64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_sub_u32 s0, s6, s0
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_sub_u32 s0, s6, s2
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -962,11 +963,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -976,11 +977,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -990,10 +991,10 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1029,74 +1030,74 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: ssub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, 0x56789876, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, 0x1234, s3
+; VI-NEXT: s_sub_u32 s0, 0x56789876, s6
+; VI-NEXT: s_subb_u32 s1, 0x1234, s7
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: ssub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: ssub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: ssub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W32-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W32-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: ssub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W64-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W64-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: ssub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX11-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1128,66 +1129,66 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vsub64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1219,65 +1220,66 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vsub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x1234, 0, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1310,12 +1312,12 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: susubo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sub_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1346,36 +1348,36 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, s2, s3
+; GFX11-NEXT: s_sub_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1420,12 +1422,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: usubo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1462,42 +1464,42 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: usubo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_sub_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: usubo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: usubo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_sub_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1541,19 +1543,19 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: susubo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -1561,80 +1563,80 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s6, s4, s6
-; GFX1010-NEXT: s_subb_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_sub_u32 s0, s8, s10
+; GFX1010-NEXT: s_subb_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: susubo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s6, s4, s6
-; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_sub_u32 s0, s8, s10
+; GFX11-NEXT: s_subb_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1678,13 +1680,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vusubo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_sub_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -1726,48 +1728,48 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vusubo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vusubo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vusubo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 84bd9b6..5c9762b 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -23,15 +23,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -40,24 +40,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -95,15 +95,15 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -115,29 +115,29 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -174,15 +174,15 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_dbg_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -191,24 +191,24 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_dbg_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_dbg_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,15 +244,15 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_add_neg_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_floor_f32_e32 v2, v3
@@ -262,27 +262,27 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_add_neg_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_neg_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,15 +318,15 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_non_clamp_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -336,27 +336,27 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_non_clamp_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_non_clamp_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -389,15 +389,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_add_src_f32_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -406,24 +406,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_add_src_f32_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -459,15 +459,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_add_src_f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -476,24 +476,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_add_src_f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -529,15 +529,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -546,24 +546,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -598,15 +598,15 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_add_src_v2f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
@@ -616,26 +616,26 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_add_src_v2f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
@@ -686,24 +686,24 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,16 +826,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -846,24 +846,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -905,16 +905,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -925,24 +925,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -992,16 +992,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1014,27 +1014,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1078,16 +1078,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1099,27 +1099,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1165,16 +1165,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -1186,27 +1186,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,16 +1251,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1271,27 +1271,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1334,15 +1334,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1354,27 +1354,27 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1419,16 +1419,16 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_no_clamp_add_packed_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1440,27 +1440,27 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1505,16 +1505,16 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v2, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v2, 1.0 clamp
@@ -1523,30 +1523,30 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 9472845..57e855f 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -24,15 +24,15 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -41,37 +41,37 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -105,15 +105,15 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -v3, -v3 clamp
@@ -122,37 +122,37 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -187,15 +187,15 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -|v3|, -|v3| clamp
@@ -204,37 +204,37 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -273,15 +273,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -292,43 +292,43 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -367,15 +367,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -386,43 +386,43 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -461,15 +461,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -482,31 +482,31 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -515,16 +515,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX12-LABEL: v_clamp_multi_use_max_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1
; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -563,15 +563,15 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, v3, v3 clamp
@@ -580,37 +580,37 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -645,15 +645,15 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -v3, -v3 clamp
@@ -662,37 +662,37 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -728,15 +728,15 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -|v3|, -|v3| clamp
@@ -745,37 +745,37 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -812,15 +812,15 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
@@ -829,37 +829,37 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,15 +893,15 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
@@ -910,37 +910,37 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -975,15 +975,15 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
@@ -992,37 +992,37 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1060,16 +1060,16 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_brev_b32 s0, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_brev_b32 s0, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, s0, 1.0, v3
@@ -1078,38 +1078,38 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_brev_b32 s0, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: s_brev_b32 s2, 1
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_med3_f32 v1, s0, 1.0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1141,15 +1141,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_aby_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1158,37 +1158,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_aby_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1220,15 +1220,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bay_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1237,37 +1237,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bay_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yab_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1316,37 +1316,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yab_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1378,15 +1378,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yba_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1395,37 +1395,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yba_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_ayb_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1474,37 +1474,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_ayb_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1536,15 +1536,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bya_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1553,37 +1553,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bya_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,41 +1611,41 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
;
; GFX8-LABEL: v_clamp_constants_to_one_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 1.0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_one_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 1.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_one_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_one_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1670,41 +1670,41 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constants_to_zero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_zero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_zero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_zero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1730,41 +1730,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constant_preserve_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0.5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0.5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1790,41 +1790,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1849,41 +1849,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_qnan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1908,41 +1908,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1977,15 +1977,15 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -1995,40 +1995,40 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2063,15 +2063,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp
@@ -2080,37 +2080,37 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,15 +2146,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -2164,40 +2164,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2232,15 +2232,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -2250,40 +2250,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2318,15 +2318,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2335,37 +2335,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2397,15 +2397,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2414,37 +2414,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2476,15 +2476,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0
@@ -2493,37 +2493,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2555,15 +2555,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0
@@ -2572,37 +2572,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2634,15 +2634,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0
@@ -2651,37 +2651,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2713,15 +2713,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0
@@ -2730,37 +2730,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2788,41 +2788,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2848,41 +2848,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2918,15 +2918,15 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2937,37 +2937,37 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3011,16 +3011,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_v2f16_undef_elt:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3035,37 +3035,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_v2f16_undef_elt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_elt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_elt:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3107,15 +3107,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX8-LABEL: v_clamp_v2f16_not_zero:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3128,45 +3128,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_clamp_v2f16_not_zero:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_zero:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_zero:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3207,15 +3207,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_not_one:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3228,45 +3228,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_not_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_one:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3307,15 +3307,15 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: v_clamp_neg_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3326,37 +3326,37 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_clamp_neg_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3398,15 +3398,15 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_negabs_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3417,42 +3417,42 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_negabs_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3496,15 +3496,15 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neglo_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3515,37 +3515,37 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neglo_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neglo_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neglo_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3588,15 +3588,15 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neghi_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3607,37 +3607,37 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neghi_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neghi_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neghi_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3680,15 +3680,15 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_shuffle:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3699,37 +3699,37 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_shuffle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_shuffle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_shuffle:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3774,16 +3774,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3798,37 +3798,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3872,16 +3872,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3896,37 +3896,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3961,70 +3961,70 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_diff_source_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8
-; GFX8-NEXT: s_add_u32 s0, s0, 12
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s6, s[6:7], 0x8
+; GFX8-NEXT: s_add_u32 s2, s4, 12
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_add_f32_e32 v0, s4, v0
-; GFX8-NEXT: v_add_f32_e32 v1, s4, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX8-NEXT: v_add_f32_e32 v1, s0, v1
; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_diff_source_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_add_f32_e32 v1, s0, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s0, v2
; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5] offset:12
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_diff_source_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
-; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_add_f32_e64 v1, s0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] offset:12
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_diff_source_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_f32 s2, s4, s5
-; GFX12-NEXT: s_add_f32 s3, s4, s6
+; GFX12-NEXT: s_add_f32 s1, s0, s1
+; GFX12-NEXT: s_add_f32 s0, s0, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT: s_max_num_f32 s2, s2, s3
-; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12
+; GFX12-NEXT: s_max_num_f32 s0, s1, s0
+; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 clamp
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] offset:12
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 9c7fa15..b969573c8 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -20,14 +20,14 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -103,14 +103,14 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: sub1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -450,15 +450,15 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add_and:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_max_u32_e32 v1, 1, v1
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -493,14 +493,14 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_sext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -533,14 +533,14 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_zext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index c27e446..4b266d0 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -4,13 +4,13 @@
define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadCombine:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadShuffle:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0x7050604
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: s_mov_b32 s0, 0x7050604
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_perm_b32 v2, v2, v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index e9dbce9..52b9603 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_copy_v4i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -73,24 +73,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa
; VI-LABEL: test_copy_v4i8_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -129,27 +128,27 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa
;
; VI-LABEL: test_copy_v4i8_x3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -199,31 +198,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s22, s10
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s23, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s20, s6
-; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s22, s2
+; VI-NEXT: s_mov_b32 s23, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s20, s10
+; VI-NEXT: s_mov_b32 s21, s11
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
@@ -280,22 +278,21 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -310,7 +307,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -365,23 +362,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -396,9 +393,9 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -433,19 +430,19 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -477,22 +474,22 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 2
store <3 x i8> %val, ptr addrspace(1) %out, align 2
@@ -525,24 +522,24 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 1
store <3 x i8> %val, ptr addrspace(1) %out, align 1
@@ -569,19 +566,19 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
store <4 x i8> %val, ptr addrspace(1) %out, align 4
@@ -618,28 +615,28 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out,
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%val = load <4 x i8>, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index 7dd95a0..f10fe68 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252
-; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GCN-NEXT: s_cselect_b32 s2, 2, 3
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 2, 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[4:5]
; GCN-NEXT: s_endpgm
entry: ; preds = %1009
%0 = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 332b601..848ac3b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -36,15 +36,15 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_ctlz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i32:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
; GFX11-LABEL: s_ctlz_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s2
+; GFX11-NEXT: s_clz_i32_u32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 32
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 32
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -612,16 +612,16 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctlz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i64:
@@ -674,13 +674,13 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT: s_clz_i32_u64 s0, s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 64
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 64
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 97529b5..2dd3a7b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -41,13 +41,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -331,14 +331,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 24
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 24
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -405,15 +405,15 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s2, s2, -16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: s_add_i32 s0, s0, -16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -479,13 +479,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1198,13 +1198,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; VI-LABEL: s_ctlz_zero_undef_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2218,19 +2218,19 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x3ffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s2, s2, -14
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s0, s0, 2
+; VI-NEXT: s_and_b32 s0, s4, 0x3ffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s4, s0, -14
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_add_u32 s0, s2, 2
; VI-NEXT: flat_store_short v[0:1], v2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_bfe_u32 s2, s4, 0x20010
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 4f2bde8..6e39b83 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val)
;
; VI-LABEL: s_ctpop_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctpop_i16:
@@ -167,14 +167,14 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out,
; VI-LABEL: v_ctpop_add_chain_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1423,15 +1423,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
; VI-LABEL: v_ctpop_i16_add_vvar_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
@@ -1521,29 +1521,29 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_cmp_lg_u32 s5, 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: s_cbranch_execnz .LBB14_3
; VI-NEXT: .LBB14_2: ; %if
-; VI-NEXT: s_and_b32 s2, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s2, s2
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: .LBB14_3: ; %endif
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 633f120..bd451dc 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -28,14 +28,14 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctpop_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
@@ -116,7 +116,7 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-LABEL: v_ctpop_i64_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
@@ -128,8 +128,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -159,15 +159,15 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
; VI-LABEL: s_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -197,19 +197,19 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64
; VI-LABEL: s_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s15, 0xf000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
-; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
+; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -424,15 +424,15 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
; VI-LABEL: s_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_add_i32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
+; VI-NEXT: s_add_i32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 483402d..e1b01c0 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -35,15 +35,15 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_cttz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i32:
@@ -519,16 +519,16 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_cttz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index a6cbfa5..7eb2e52 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -28,13 +28,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_cttz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -317,13 +317,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_cttz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -386,13 +386,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -455,13 +455,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fd4e182..e6d68a1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2788,36 +2788,36 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX10-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
@@ -2836,17 +2836,17 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; GFX11-LABEL: cvt_ubyte0_or_multiuse:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v2, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index fed4b98..37b4dfa 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -8,13 +8,13 @@
define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: add:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_add v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: sub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_and v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: or:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_or v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xor:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: nand:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_not_b32_e32 v0, v3
; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: cmpxchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(
define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: inc:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: dec:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fadd:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB18_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fsub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB19_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -547,13 +547,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -569,13 +569,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -591,13 +591,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -613,13 +613,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -635,13 +635,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -657,13 +657,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -679,13 +679,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -701,13 +701,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.and.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -723,13 +723,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.or.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -745,13 +745,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -767,13 +767,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.inc.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -789,13 +789,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.dec.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -812,13 +812,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8)
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 1, i32 2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -834,14 +834,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 1.0
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f32 = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -859,14 +860,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -884,14 +886,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 67b0cef..cff77bf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -42,13 +42,13 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -127,13 +127,13 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -212,13 +212,13 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
; GFX11-LABEL: uniform_vec_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_i16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_i16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -303,10 +303,10 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_i16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_f16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_f16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -587,10 +587,10 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_f16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -723,13 +723,13 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: ds_load_u16_d16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index b0e1da3..b5933b4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -7,11 +7,11 @@
define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds1align1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u8 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: s_endpgm
@@ -23,12 +23,12 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds2align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -37,12 +37,12 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds2align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
@@ -52,11 +52,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_u16 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b16 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -68,11 +68,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds2align2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u16 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b16 v1, v0
; GCN-NEXT: s_endpgm
@@ -84,14 +84,14 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -104,15 +104,15 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -130,11 +130,11 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -146,12 +146,12 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -160,12 +160,12 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
@@ -174,11 +174,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -190,11 +190,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds4align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
@@ -206,9 +206,9 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -217,7 +217,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -234,9 +234,9 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -258,7 +258,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8
@@ -275,11 +275,11 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -291,14 +291,14 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -311,14 +311,14 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -331,11 +331,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -347,11 +347,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT: s_endpgm
@@ -363,11 +363,11 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b64 v[0:1], v0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v2, v[0:1]
; GCN-NEXT: s_endpgm
@@ -379,9 +379,9 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -394,7 +394,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -415,9 +415,9 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -449,7 +449,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1
@@ -473,11 +473,11 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -489,15 +489,15 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8
@@ -513,16 +513,16 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
@@ -539,11 +539,11 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -555,12 +555,12 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -583,11 +583,11 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -599,12 +599,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align8:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2
; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1]
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -613,12 +613,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align8:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -627,12 +627,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align8:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8
; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -641,11 +641,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align8:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -657,11 +657,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds12align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b96 v[0:2], v0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b96 v3, v[0:2]
; GCN-NEXT: s_endpgm
@@ -673,9 +673,9 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o
define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -692,7 +692,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13
; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -716,9 +716,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -760,7 +760,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
@@ -789,11 +789,11 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -805,9 +805,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
@@ -815,7 +815,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12
@@ -835,9 +835,9 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
@@ -850,7 +850,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -867,11 +867,11 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -883,12 +883,12 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds16align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -897,12 +897,12 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds16align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -911,11 +911,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds16align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -927,11 +927,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT: s_endpgm
@@ -943,11 +943,11 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b128 v[0:3], v0
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b128 v4, v[0:3]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
index 5814b8a..4cd5835 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace
; GCN-LABEL: {{^}}ds_combine_WAR
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
-; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
%addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 06908d2..ee374bd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-ALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
@@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index db3ea4d..e16bb28 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -411,15 +411,15 @@ entry:
; GCN-LABEL: {{^}}bit4_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 3
-; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s4, 3
+; GCN-NEXT: s_lshr_b32 s0, 0x1000100, s0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 44d65c9..6823dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -21,32 +21,32 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: extract_vector_elt_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20
+; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:20
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -140,6 +140,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -147,35 +148,33 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_load_dword s1, s[2:3], 0x0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1
+; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,18 +316,18 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: v_extractelement_v4f16_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v2
@@ -336,13 +335,13 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; GFX11-LABEL: v_extractelement_v4f16_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4
+; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,43 +379,42 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %
;
; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2]
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; VI-NEXT: flat_store_short v[1:2], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc
+; GFX11-NEXT: buffer_load_b32 v3, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2]
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -451,12 +449,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_01:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -468,12 +466,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_01:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -512,12 +510,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x4
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x4
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -529,12 +527,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_23:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 8f0d639..b243450 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -149,19 +149,19 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: s_bitset0_b32 s0, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_and_b32 s0, s7, 0x7fffffff
+; VI-NEXT: s_and_b32 s1, s6, 0x7fffffff
+; VI-NEXT: s_bitset0_b32 s5, 31
+; VI-NEXT: s_bitset0_b32 s4, 31
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index cdc6b5a..00d77de 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -36,50 +36,50 @@ define amdgpu_kernel void @fadd_f16(
; VI-LABEL: fadd_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s4
-; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -88,17 +88,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -112,24 +112,24 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
@@ -138,17 +138,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
@@ -216,94 +216,94 @@ define amdgpu_kernel void @fadd_f16_imm_a(
;
; VI-LABEL: fadd_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -360,94 +360,94 @@ define amdgpu_kernel void @fadd_f16_imm_b(
;
; VI-LABEL: fadd_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -566,12 +566,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -606,12 +606,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -684,85 +684,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
;
; VI-LABEL: fadd_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -823,85 +823,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
;
; VI-LABEL: fadd_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 581b7b4..fb47dae 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -21,20 +21,20 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_undef_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_undef_value_f16:
@@ -49,10 +49,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -64,10 +64,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
@@ -76,10 +76,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX9-LABEL: v_test_canonicalize_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: global_store_short v[0:1], v0, off
@@ -100,10 +100,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX11-LABEL: v_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
@@ -119,12 +119,12 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -153,12 +153,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; GFX11-LABEL: s_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_max_f16_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,10 +239,10 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, |v2|, |v2|
@@ -251,13 +251,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_f16:
@@ -275,13 +275,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,10 +295,10 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -|v2|, -|v2|
@@ -307,13 +307,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
@@ -331,13 +331,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -352,10 +352,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -v2, -v2
@@ -364,13 +364,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_f16:
@@ -388,13 +388,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -408,10 +408,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e32 v2, -1.0, v2
@@ -420,13 +420,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
@@ -444,13 +444,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,10 +464,10 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e64 v2, -1.0, |v2|
@@ -476,13 +476,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
@@ -500,13 +500,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -521,20 +521,20 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_f16:
@@ -549,10 +549,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -564,21 +564,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_f16:
@@ -593,10 +593,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,21 +608,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_f16:
@@ -637,10 +637,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,21 +652,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_f16:
@@ -681,10 +681,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -696,21 +696,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_f16:
@@ -725,10 +725,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
;
; GFX11-LABEL: test_fold_canonicalize_literal_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -740,21 +740,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -784,21 +784,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
@@ -813,10 +813,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,21 +828,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
@@ -857,10 +857,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -872,21 +872,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
@@ -901,10 +901,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -916,21 +916,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_f16:
@@ -945,10 +945,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -960,21 +960,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
@@ -989,10 +989,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,21 +1004,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
@@ -1033,10 +1033,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1048,21 +1048,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
@@ -1077,10 +1077,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1092,21 +1092,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
@@ -1121,10 +1121,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1136,21 +1136,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
@@ -1165,10 +1165,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1180,21 +1180,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
@@ -1209,10 +1209,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1224,32 +1224,32 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_var_v2f16:
@@ -1277,13 +1277,13 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: v_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1298,33 +1298,33 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
@@ -1352,15 +1352,15 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1376,33 +1376,33 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
@@ -1431,15 +1431,15 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1456,32 +1456,32 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
@@ -1510,13 +1510,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1532,16 +1532,16 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_max_f16_e64 v0, s2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1575,12 +1575,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1593,20 +1593,20 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_v2f16:
@@ -1621,10 +1621,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1636,21 +1636,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x80008000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_v2f16:
@@ -1665,10 +1665,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1680,21 +1680,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_v2f16:
@@ -1709,10 +1709,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1724,21 +1724,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_v2f16:
@@ -1753,10 +1753,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1768,21 +1768,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_v2f16:
@@ -1797,10 +1797,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
;
; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1812,21 +1812,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1841,10 +1841,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1856,21 +1856,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1885,10 +1885,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1900,21 +1900,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1929,10 +1929,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1944,21 +1944,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1973,10 +1973,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1988,21 +1988,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
@@ -2017,10 +2017,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
;
; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,21 +2032,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
@@ -2061,10 +2061,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2076,21 +2076,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
@@ -2105,10 +2105,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2120,21 +2120,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
@@ -2149,10 +2149,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2164,21 +2164,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
@@ -2193,10 +2193,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,21 +2208,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
@@ -2237,10 +2237,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2252,21 +2252,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
@@ -2281,10 +2281,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2376,20 +2376,20 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v2f16:
@@ -2404,10 +2404,10 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2678,22 +2678,22 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v4f16:
@@ -2709,12 +2709,12 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7d8f43b..038aad3 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -58,25 +58,25 @@ define amdgpu_kernel void @fcmp_f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,26 +147,26 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,25 +239,25 @@ define amdgpu_kernel void @fcmp_f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -328,25 +328,25 @@ define amdgpu_kernel void @fcmp_f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,25 +417,25 @@ define amdgpu_kernel void @fcmp_f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,25 +506,25 @@ define amdgpu_kernel void @fcmp_f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -595,25 +595,25 @@ define amdgpu_kernel void @fcmp_f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,25 +684,25 @@ define amdgpu_kernel void @fcmp_f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -773,25 +773,25 @@ define amdgpu_kernel void @fcmp_f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -862,25 +862,25 @@ define amdgpu_kernel void @fcmp_f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,25 +951,25 @@ define amdgpu_kernel void @fcmp_f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1040,25 +1040,25 @@ define amdgpu_kernel void @fcmp_f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1129,25 +1129,25 @@ define amdgpu_kernel void @fcmp_f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1218,25 +1218,25 @@ define amdgpu_kernel void @fcmp_f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,25 +1307,25 @@ define amdgpu_kernel void @fcmp_f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1406,20 +1406,20 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1429,7 +1429,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1511,20 +1511,20 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1534,7 +1534,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1615,20 +1615,20 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1638,7 +1638,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1719,20 +1719,20 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1742,7 +1742,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1824,20 +1824,20 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1847,7 +1847,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1929,20 +1929,20 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1952,7 +1952,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2034,20 +2034,20 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2057,7 +2057,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2139,20 +2139,20 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2162,7 +2162,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2243,20 +2243,20 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2347,20 +2347,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2370,7 +2370,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2452,20 +2452,20 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2475,7 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2556,20 +2556,20 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2579,7 +2579,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2660,20 +2660,20 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2683,7 +2683,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2764,20 +2764,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fd80580..b2fadbd 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -31,16 +31,16 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
;
; VI-LABEL: s_copysign_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s3, 0x7fff
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_bfi_b32 v2, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -61,15 +61,15 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
; GFX11-LABEL: s_copysign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -117,13 +117,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,13 +147,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -201,13 +201,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -225,13 +225,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -255,13 +255,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -279,13 +279,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,13 +309,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -333,13 +333,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -365,13 +365,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_and_b32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,12 +389,12 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,14 +421,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -447,14 +447,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,14 +481,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
;
; VI-LABEL: s_test_copysign_f16_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -507,14 +507,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
; GFX11-LABEL: s_test_copysign_f16_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -540,14 +540,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
;
; VI-LABEL: s_test_copysign_f16_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -566,14 +566,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -600,14 +600,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f16_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -626,14 +626,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -850,19 +850,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -950,19 +950,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -993,13 +993,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
;
; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1050,19 +1051,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1093,19 +1094,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1151,19 +1152,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1194,19 +1195,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v1, s[4:5]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1252,19 +1253,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1295,19 +1296,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1353,18 +1354,18 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1393,19 +1394,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1453,19 +1454,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1497,12 +1498,12 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1904,29 +1905,29 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; VI-LABEL: s_copysign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
-; VI-NEXT: s_add_u32 s2, s0, 4
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_bfi_b32 v3, s0, v0, v1
+; VI-NEXT: s_add_u32 s0, s2, 4
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1958,24 +1959,24 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s2, s6, 16
+; GFX11-NEXT: s_lshr_b32 s0, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: v_mov_b32_e32 v2, s7
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
-; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3] offset:4
+; GFX11-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2023,31 +2024,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; VI-LABEL: s_copysign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_lshr_b32 s3, s7, 16
+; VI-NEXT: s_lshr_b32 s1, s7, 16
; VI-NEXT: s_lshr_b32 s5, s5, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v2
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2085,26 +2086,26 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: s_lshr_b32 s2, s7, 16
+; GFX11-NEXT: s_lshr_b32 s0, s7, 16
; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1
-; GFX11-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
+; GFX11-NEXT: s_lshr_b32 s1, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fb04b66..3f5d90e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -63,26 +63,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -106,26 +106,26 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -149,26 +149,26 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -192,26 +192,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -235,26 +235,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -278,26 +278,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -323,28 +323,28 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,28 +369,28 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo
;
; VI-LABEL: s_test_copysign_f32_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -415,28 +415,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f
;
; VI-LABEL: s_test_copysign_f32_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -461,28 +461,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f32_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -512,17 +512,17 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_bfi_b32 v0, s2, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_bfi_b32 v0, s0, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -530,14 +530,14 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -571,20 +571,20 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; VI-LABEL: s_test_copysign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s7, -2
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v2, s7, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v3, v0
+; VI-NEXT: v_bfi_b32 v1, s7, v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v3, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v3
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_bfi_b32 v0, s7, v0, v3
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9
@@ -602,7 +602,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3
-; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -638,23 +638,23 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; VI-LABEL: s_test_copysign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v0
+; VI-NEXT: v_bfi_b32 v2, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_bfi_b32 v0, s12, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -662,7 +662,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10
@@ -673,7 +673,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -921,16 +921,16 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s2, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -940,12 +940,12 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -972,25 +972,25 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o
;
; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 1.0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1063,31 +1063,31 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_1_fpext_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: s_and_b32 s0, s0, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index b5fa3fd..5d5a4e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -29,15 +29,15 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -46,14 +46,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -79,13 +79,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -125,13 +125,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -139,13 +139,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: s_test_copysign_f64_10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -217,13 +217,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -231,13 +231,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -263,13 +263,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -277,13 +277,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -312,32 +312,32 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
; VI-LABEL: s_test_copysign_f64_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -366,33 +366,33 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
;
; VI-LABEL: s_test_copysign_f64_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v1, v0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -419,24 +419,24 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -463,26 +463,26 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -509,26 +509,26 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou
;
; VI-LABEL: s_test_copysign_f64_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -555,26 +555,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d
;
; VI-LABEL: s_test_copysign_f64_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -601,26 +601,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f64_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,19 +652,19 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; VI-LABEL: s_test_copysign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s8, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s8, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_bfi_b32 v1, s8, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -672,7 +672,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11
; GFX11-NEXT: v_mov_b32_e32 v2, s9
@@ -681,7 +681,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s6
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -718,28 +718,28 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; VI-LABEL: s_test_copysign_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s10, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s10, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v1, s10, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_bfi_b32 v5, s10, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4
@@ -758,8 +758,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,32 +801,32 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; VI-LABEL: s_test_copysign_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_bfi_b32 v1, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v2, s19
-; VI-NEXT: v_bfi_b32 v7, s2, v0, v2
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v7, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_bfi_b32 v5, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v6, s10
-; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -834,7 +834,7 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10
@@ -848,8 +848,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11-NEXT: v_mov_b32_e32 v6, s6
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index b14b642..cfb608c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -47,14 +47,14 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX8-LABEL: v_fdiv_f16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -111,12 +111,12 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -178,52 +178,52 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rcp_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -272,52 +272,52 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_abs:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, |v0|
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_abs:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_abs:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,52 +369,52 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
;
; GFX8-LABEL: reciprocal_f16_rounded:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: reciprocal_f16_rounded:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: reciprocal_f16_rounded:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,52 +450,52 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_afn:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_afn:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_afn:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -544,52 +544,52 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,52 +641,52 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rsq_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -739,17 +739,17 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rsq_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
@@ -757,39 +757,39 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rsq_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -844,16 +844,16 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX8-LABEL: v_rsq_f16_multi_use:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_rsq_f16_e32 v4, v3
; GFX8-NEXT: flat_store_short v[0:1], v3
@@ -863,41 +863,41 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX9-LABEL: v_rsq_f16_multi_use:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v2, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_multi_use:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v2, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_multi_use:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v2, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,57 +951,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract0:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract0:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract0:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1054,57 +1054,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1157,57 +1157,57 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
;
; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,14 +1251,14 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX8-LABEL: v_fdiv_f16_afn:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1362,14 +1362,14 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX8-LABEL: v_fdiv_f16_unsafe:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1418,12 +1418,12 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1463,46 +1463,46 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_2_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_2_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_2_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_2_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1530,46 +1530,46 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1597,46 +1597,46 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_neg_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_neg_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_neg_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b639768..92db799 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -1077,7 +1077,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-LABEL: s_fdiv_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0
@@ -1097,6 +1096,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -1108,8 +1108,9 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -1120,7 +1121,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -1132,6 +1132,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4
@@ -1147,14 +1148,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5
; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
@@ -1185,7 +1186,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1227,12 +1228,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s6
; GFX8-NEXT: v_rcp_f32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1256,14 +1257,14 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s6
; GFX11-NEXT: v_rcp_f32_e32 v1, s7
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1305,14 +1306,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1334,14 +1335,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1383,14 +1384,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1412,14 +1413,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2179,10 +2180,10 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -2194,18 +2195,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -2217,19 +2219,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -2242,8 +2244,8 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2331,10 +2333,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2
@@ -2344,52 +2346,53 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index c56b4ae..fede468 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -5,28 +5,28 @@
define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
@@ -37,28 +37,28 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
@@ -69,28 +69,28 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load
@@ -102,28 +102,28 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -512,13 +512,13 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float 1.000000e+00, %load, !fpmath !0
@@ -529,25 +529,25 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float -1.000000e+00, %load, !fpmath !0
@@ -558,13 +558,13 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_minus_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load, !fpmath !0
@@ -576,25 +576,25 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load, !fpmath !0
@@ -606,11 +606,11 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -645,7 +645,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load
@@ -656,11 +656,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -672,16 +672,16 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -695,7 +695,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load
@@ -706,11 +706,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -722,16 +722,16 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, -s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -745,7 +745,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -757,11 +757,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -773,16 +773,16 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, -s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -796,7 +796,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index ab3650f..e0abaa6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4354,14 +4354,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 16
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4606,12 +4606,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN2-LABEL: atomic_cmpxchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5263,31 +5263,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5314,29 +5314,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5368,12 +5368,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5381,8 +5381,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5432,19 +5432,19 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5673,31 +5673,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5724,29 +5724,29 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5778,12 +5778,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5791,8 +5791,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5842,19 +5842,19 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6083,31 +6083,31 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6134,29 +6134,29 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6188,10 +6188,10 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
; GCN2-LABEL: atomic_load_i8_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s4, s0
-; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s4, s2
+; GCN2-NEXT: s_addc_u32 s1, s5, s3
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6378,31 +6378,31 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6429,29 +6429,29 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6483,12 +6483,12 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i16_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6496,8 +6496,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7963,31 +7963,31 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
@@ -8013,29 +8013,29 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
@@ -8062,31 +8062,31 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
@@ -8112,29 +8112,29 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 816142d..1d204ac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3953,14 +3953,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -3971,7 +3971,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3981,8 +3981,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -4152,14 +4152,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -4168,7 +4168,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4178,8 +4178,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5096,14 +5096,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5114,7 +5114,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5205,14 +5205,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -5221,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5231,8 +5231,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6890,14 +6890,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6908,7 +6908,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6918,8 +6918,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7076,14 +7076,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -7092,7 +7092,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -7102,8 +7102,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index b8c8d99..fa5a0db 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -21,13 +21,13 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -36,10 +36,10 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -72,20 +72,20 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -93,10 +93,10 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -133,10 +133,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_add_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -152,12 +152,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32
@@ -195,38 +195,38 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -253,12 +253,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -266,10 +266,10 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -300,12 +300,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_add_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -318,10 +318,10 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -355,10 +355,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_add_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -372,12 +372,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1]
@@ -412,36 +412,36 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -469,13 +469,13 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -484,10 +484,10 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -520,20 +520,20 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -541,10 +541,10 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -581,10 +581,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_and_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -600,12 +600,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32
@@ -643,38 +643,38 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_and_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -701,12 +701,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -714,10 +714,10 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -748,12 +748,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_and_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -766,10 +766,10 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -803,10 +803,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_and_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -820,12 +820,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1]
@@ -860,36 +860,36 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -917,13 +917,13 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -932,10 +932,10 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -968,20 +968,20 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -989,10 +989,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1029,10 +1029,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_sub_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1048,12 +1048,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
@@ -1091,38 +1091,38 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1149,12 +1149,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1162,10 +1162,10 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1196,12 +1196,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_sub_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1214,10 +1214,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1251,10 +1251,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_sub_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1268,12 +1268,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1]
@@ -1308,36 +1308,36 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1364,13 +1364,13 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1378,10 +1378,10 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1414,19 +1414,19 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1435,10 +1435,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1474,10 +1474,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1492,12 +1492,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32
@@ -1535,38 +1535,38 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1592,22 +1592,22 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1638,12 +1638,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_max_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -1656,10 +1656,10 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1692,10 +1692,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_max_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1]
@@ -1748,36 +1748,36 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1804,13 +1804,13 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1818,10 +1818,10 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1854,19 +1854,19 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1875,10 +1875,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1914,10 +1914,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1932,12 +1932,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32
@@ -1975,38 +1975,38 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2032,22 +2032,22 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2078,12 +2078,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umax_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2096,10 +2096,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2132,10 +2132,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umax_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2148,12 +2148,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1]
@@ -2188,36 +2188,36 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2244,13 +2244,13 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2294,19 +2294,19 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2354,10 +2354,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2372,12 +2372,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32
@@ -2415,38 +2415,38 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2472,22 +2472,22 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2518,12 +2518,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_min_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2536,10 +2536,10 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2572,10 +2572,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_min_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2588,12 +2588,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1]
@@ -2628,36 +2628,36 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2684,13 +2684,13 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2698,10 +2698,10 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2734,19 +2734,19 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2755,10 +2755,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2794,10 +2794,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umin_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2812,12 +2812,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32
@@ -2855,38 +2855,38 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2912,22 +2912,22 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2958,12 +2958,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umin_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2976,10 +2976,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3012,10 +3012,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umin_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3028,12 +3028,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umin_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1]
@@ -3068,36 +3068,36 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3125,13 +3125,13 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3140,10 +3140,10 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3176,20 +3176,20 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3197,10 +3197,10 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3237,10 +3237,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GCN2-LABEL: atomic_or_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3256,12 +3256,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32
@@ -3299,38 +3299,38 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3357,12 +3357,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3404,12 +3404,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_or_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3422,10 +3422,10 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3459,10 +3459,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN2-LABEL: atomic_or_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3476,12 +3476,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1]
@@ -3516,36 +3516,36 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3573,13 +3573,13 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3588,10 +3588,10 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3620,13 +3620,13 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GCN2-LABEL: atomic_xchg_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3635,10 +3635,10 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3667,13 +3667,13 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GCN2-LABEL: atomic_xchg_pointer_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3682,10 +3682,10 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3718,20 +3718,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3739,10 +3739,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3779,10 +3779,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_xchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3798,12 +3798,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
@@ -3841,38 +3841,38 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3899,12 +3899,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3912,10 +3912,10 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3946,12 +3946,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3964,10 +3964,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4001,10 +4001,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4018,12 +4018,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1]
@@ -4058,36 +4058,36 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4115,13 +4115,13 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4130,10 +4130,10 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4166,20 +4166,20 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -4187,10 +4187,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4227,10 +4227,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_xor_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4246,12 +4246,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
@@ -4289,38 +4289,38 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4347,12 +4347,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4360,10 +4360,10 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4394,12 +4394,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xor_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4412,10 +4412,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4449,10 +4449,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xor_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4466,12 +4466,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1]
@@ -4506,36 +4506,36 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4564,26 +4564,26 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -4613,24 +4613,24 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4665,12 +4665,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -4678,20 +4678,20 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -4728,31 +4728,31 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -4783,23 +4783,23 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4822,21 +4822,21 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4865,10 +4865,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GCN2-LABEL: atomic_store_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4882,14 +4882,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4918,10 +4918,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GCN2-LABEL: atomic_store_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4933,14 +4933,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4971,16 +4971,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 32
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4990,11 +4990,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5027,16 +5027,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN2-LABEL: atomic_cmpxchg_i64_soffset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 0x11940
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 0x11940
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5046,11 +5046,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5084,35 +5084,35 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5146,18 +5146,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5166,12 +5166,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5212,19 +5212,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: s_add_u32 s0, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: s_add_u32 s2, s0, 32
-; GCN2-NEXT: s_addc_u32 s3, s3, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5237,13 +5237,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5279,14 +5279,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN2-LABEL: atomic_cmpxchg_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v4, s4
; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5296,11 +5296,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5331,33 +5331,33 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5388,16 +5388,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5406,12 +5406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5449,17 +5449,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: s_add_u32 s2, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5472,13 +5472,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5512,26 +5512,26 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -5561,24 +5561,24 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5613,12 +5613,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5626,20 +5626,20 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -5676,31 +5676,31 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -5731,23 +5731,23 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5770,21 +5770,21 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5813,10 +5813,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GCN2-LABEL: atomic_store_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -5830,14 +5830,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5866,10 +5866,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GCN2-LABEL: atomic_store_f64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -5881,14 +5881,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5915,13 +5915,13 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5930,10 +5930,10 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5966,20 +5966,20 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -5987,10 +5987,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6027,10 +6027,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_inc_i64_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6046,12 +6046,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
@@ -6089,38 +6089,38 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6147,12 +6147,12 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6160,10 +6160,10 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6194,12 +6194,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_inc_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6212,10 +6212,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6249,10 +6249,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_inc_i64_incr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6266,12 +6266,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_inc_i64_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
@@ -6306,36 +6306,36 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6363,13 +6363,13 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6378,10 +6378,10 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6414,20 +6414,20 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -6435,10 +6435,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6475,10 +6475,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_dec_i64_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6494,12 +6494,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
@@ -6537,38 +6537,38 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6595,12 +6595,12 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6608,10 +6608,10 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6642,12 +6642,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_dec_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6660,10 +6660,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6697,10 +6697,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_dec_i64_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6714,12 +6714,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
@@ -6754,36 +6754,36 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index d812b4b..19601b1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -4292,24 +4292,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4402,25 +4402,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4432,30 +4432,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -4514,22 +4514,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
;
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4619,23 +4619,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4647,30 +4647,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5674,24 +5674,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
;
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5784,25 +5784,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5814,30 +5814,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5899,23 +5899,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5927,30 +5927,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -7898,24 +7898,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8008,25 +8008,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8038,30 +8038,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -8118,20 +8118,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8139,29 +8139,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB127_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v6, s3
-; GCN3-NEXT: v_mov_b32_e32 v7, s2
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -8218,23 +8218,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8246,30 +8246,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 2a9a9ef..7bbbb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -948,12 +948,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -967,12 +967,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1041,12 +1041,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1060,12 +1060,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1133,12 +1133,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1153,12 +1153,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1226,12 +1226,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1246,12 +1246,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1319,12 +1319,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1339,12 +1339,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1412,12 +1412,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1505,12 +1505,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1525,12 +1525,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1598,12 +1598,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1618,12 +1618,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1711,12 +1711,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1784,12 +1784,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1804,12 +1804,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1877,12 +1877,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1897,12 +1897,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1970,12 +1970,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1990,12 +1990,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -2081,13 +2081,13 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f32_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2095,26 +2095,26 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[10:11]
+; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2165,13 +2165,13 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f64_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2179,26 +2179,26 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f64_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
-; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2236,15 +2236,15 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2282,15 +2282,15 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX11-LABEL: fma_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2333,19 +2333,19 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
;
; GFX11-LABEL: fma_neg_b_c_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
-; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[6:7]
+; GFX11-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
-; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 23eb730..36d917f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 01b2f20..35621f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -28,15 +28,15 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -80,15 +80,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -132,15 +132,15 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -184,15 +184,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 87ac95a..a8815c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 764fb99..4543038 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -45,15 +45,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -63,16 +63,16 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -83,27 +83,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -156,15 +156,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -174,16 +174,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -194,27 +194,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,15 +268,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -286,16 +286,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -306,27 +306,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,15 +380,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -398,16 +398,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -418,27 +418,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,15 +494,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -513,17 +513,17 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -534,28 +534,28 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1
; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -617,15 +617,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -639,16 +639,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -663,50 +663,50 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -714,18 +714,18 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -783,15 +783,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -802,16 +802,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -823,29 +823,29 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_fmed3_r_i_i_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -897,15 +897,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_med3_f32 v2, v3, 2.0, 4.0
@@ -914,16 +914,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -933,24 +933,24 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,15 +1004,15 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1022,16 +1022,16 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1045,52 +1045,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc
; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1170,17 +1170,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1188,8 +1188,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1197,19 +1197,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1218,8 +1218,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1229,67 +1229,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1360,17 +1360,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1378,8 +1378,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, -v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1387,19 +1387,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1408,8 +1408,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
@@ -1419,67 +1419,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1550,17 +1550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1568,8 +1568,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, -v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1577,19 +1577,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3
@@ -1609,67 +1609,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1741,17 +1741,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1768,19 +1768,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1789,8 +1789,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1801,69 +1801,69 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1942,17 +1942,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1960,8 +1960,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1969,19 +1969,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1990,8 +1990,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
@@ -2003,71 +2003,71 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2151,17 +2151,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2169,8 +2169,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -2181,19 +2181,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2202,8 +2202,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -2215,38 +2215,38 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2320,17 +2320,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2338,8 +2338,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2347,19 +2347,19 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2368,8 +2368,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2378,32 +2378,32 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2472,17 +2472,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2490,8 +2490,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2499,19 +2499,19 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2520,8 +2520,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2530,32 +2530,32 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_nnan_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2624,17 +2624,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2642,8 +2642,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2651,19 +2651,19 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2672,8 +2672,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2682,32 +2682,32 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_fast_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fast_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2788,17 +2788,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2806,8 +2806,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2815,19 +2815,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2836,8 +2836,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2846,32 +2846,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2940,17 +2940,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2958,8 +2958,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2967,19 +2967,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2988,8 +2988,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2998,32 +2998,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3093,17 +3093,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3111,8 +3111,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3120,19 +3120,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3141,8 +3141,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -3152,67 +3152,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3282,17 +3282,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3309,19 +3309,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3330,8 +3330,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3340,32 +3340,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3434,17 +3434,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3452,8 +3452,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3461,19 +3461,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3482,8 +3482,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3492,32 +3492,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3586,17 +3586,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3604,8 +3604,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3613,19 +3613,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3634,8 +3634,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3644,32 +3644,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3738,17 +3738,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3756,8 +3756,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3765,19 +3765,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3786,8 +3786,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3796,32 +3796,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3890,17 +3890,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3908,8 +3908,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3917,19 +3917,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3938,8 +3938,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3948,32 +3948,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4042,17 +4042,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4060,8 +4060,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4069,19 +4069,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4090,8 +4090,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4100,32 +4100,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4194,17 +4194,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4212,8 +4212,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4221,19 +4221,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4242,8 +4242,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4252,32 +4252,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4346,17 +4346,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4364,8 +4364,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4373,19 +4373,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4394,8 +4394,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4404,32 +4404,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4498,17 +4498,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4516,8 +4516,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4525,19 +4525,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4546,8 +4546,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4556,32 +4556,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4650,17 +4650,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4668,8 +4668,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4677,19 +4677,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4698,8 +4698,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4708,32 +4708,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4802,17 +4802,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4820,8 +4820,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4829,19 +4829,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4850,8 +4850,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4860,32 +4860,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4954,17 +4954,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4972,8 +4972,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4981,19 +4981,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5002,8 +5002,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5012,32 +5012,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5106,17 +5106,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5133,19 +5133,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5154,8 +5154,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5164,32 +5164,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5258,17 +5258,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5276,8 +5276,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5285,19 +5285,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5306,8 +5306,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5316,32 +5316,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5413,17 +5413,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5431,8 +5431,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5440,19 +5440,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5461,8 +5461,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5471,32 +5471,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5588,17 +5588,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5606,8 +5606,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5623,19 +5623,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5644,8 +5644,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5662,14 +5662,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5680,19 +5680,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5701,7 +5701,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5790,17 +5790,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5808,8 +5808,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5825,19 +5825,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5846,8 +5846,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5864,14 +5864,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5882,19 +5882,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5904,21 +5904,21 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5927,7 +5927,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-GISEL-NEXT: v_minmax_f32 v2, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6016,17 +6016,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6034,8 +6034,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6051,19 +6051,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6072,8 +6072,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6090,14 +6090,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6108,19 +6108,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
@@ -6129,7 +6129,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6210,26 +6210,26 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6243,19 +6243,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -6264,8 +6264,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6280,14 +6280,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6296,47 +6296,47 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
; GFX9-NEXT: v_min_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6411,17 +6411,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6429,8 +6429,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6441,19 +6441,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6462,8 +6462,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6475,38 +6475,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6586,17 +6586,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6604,8 +6604,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6616,19 +6616,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6637,8 +6637,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6650,38 +6650,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6761,17 +6761,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6779,8 +6779,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6791,19 +6791,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6812,8 +6812,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6825,38 +6825,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6931,17 +6931,17 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6949,8 +6949,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -6958,19 +6958,19 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6979,8 +6979,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -6990,67 +6990,67 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7126,26 +7126,26 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_min_f32_e64 v4, -v6, v2
; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2
@@ -7156,19 +7156,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -7177,8 +7177,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -7191,77 +7191,77 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7334,17 +7334,17 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7352,8 +7352,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_max_f32_e32 v2, v7, v2
; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3
@@ -7362,19 +7362,19 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -7383,8 +7383,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2
@@ -7394,33 +7394,33 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_global_nnans_min_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_min_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7487,15 +7487,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7506,17 +7506,17 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7527,27 +7527,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7644,17 +7644,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_ushort v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7662,8 +7662,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f16_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f16_e32 v2, 2.0, v2
@@ -7677,19 +7677,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_ushort v7, v[0:1] glc
@@ -7698,8 +7698,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f16_e32 v4, 1.0, v7
@@ -7714,39 +7714,39 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7810,15 +7810,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: two_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7829,17 +7829,17 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: two_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7850,45 +7850,45 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: two_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: two_non_inline_constant:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: two_non_inline_constant:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7952,16 +7952,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: one_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7974,17 +7974,17 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: one_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -7998,32 +7998,32 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: one_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: one_non_inline_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -8099,21 +8099,21 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: two_non_inline_constant_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x41000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
-; VI-SDAG-NEXT: v_med3_f32 v2, v2, s2, v4
+; VI-SDAG-NEXT: v_med3_f32 v2, v2, s0, v4
; VI-SDAG-NEXT: v_add_f32_e32 v5, 0x41800000, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, 0x41000000, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -8125,18 +8125,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: two_non_inline_constant_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -8153,18 +8153,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s0, v2
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off
@@ -8173,18 +8173,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off
@@ -8193,18 +8193,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v1, off dlc
@@ -8215,17 +8215,17 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2
; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 7337d90..2d17955 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -646,103 +646,103 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -793,103 +793,103 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index d20c39d..1620ecf 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -78,15 +78,15 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -130,15 +130,15 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ule_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -182,15 +182,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ult_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -234,15 +234,15 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -286,15 +286,15 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -338,15 +338,15 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ole_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_olt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 45f6bff..0464b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 98faaac..384ea30 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fmul_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fmul_f16_imm_a(
;
; GFX89-LABEL: fmul_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fmul_f16_imm_b(
;
; GFX89-LABEL: fmul_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fmul_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
;
; VI-LABEL: fmul_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x44004200
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x44004200
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
;
; VI-LABEL: fmul_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4200
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x42004400
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x42004400
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,24 +631,24 @@ define amdgpu_kernel void @fmul_v4f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -701,18 +701,18 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
;
; VI-LABEL: fmul_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1
@@ -720,47 +720,47 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x44004200
-; GFX9-NEXT: s_mov_b32 s3, 0x40004800
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s6, 0x44004200
+; GFX9-NEXT: s_mov_b32 s7, 0x40004800
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1
; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index bde0dc3..25ec5b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -22,19 +22,19 @@ declare half @llvm.fabs.f16(half) #1
define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -42,19 +42,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -62,65 +62,65 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -136,19 +136,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -156,19 +156,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
@@ -176,100 +176,100 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-FLUSH-LABEL: fmul_fadd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -286,19 +286,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -306,19 +306,19 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; VI-DENORM-LABEL: fmul_fadd_contract_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -326,65 +326,65 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -401,11 +401,11 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -419,11 +419,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -437,59 +437,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -509,11 +509,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -527,11 +527,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -545,59 +545,59 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -617,11 +617,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_a_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -635,11 +635,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -653,90 +653,90 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -759,11 +759,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_b_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -777,11 +777,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -795,90 +795,90 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -901,11 +901,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -919,11 +919,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -937,59 +937,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1009,11 +1009,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1027,11 +1027,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1045,59 +1045,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1137,11 +1137,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1155,59 +1155,59 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1229,11 +1229,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1247,11 +1247,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1265,59 +1265,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -2358,11 +2358,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2376,11 +2376,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2394,90 +2394,90 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -2499,11 +2499,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2517,11 +2517,11 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2535,90 +2535,90 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index ce5bb66..997db91 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -41,24 +41,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f16_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f16_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f16_e32 v1, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -81,24 +81,24 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f32_e32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f32_e32 v1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,14 +168,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_rndne_f32_e32 v2, s6
; VI-NEXT: v_rndne_f32_e32 v1, s5
; VI-NEXT: v_rndne_f32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -183,14 +183,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
; GFX11-NEXT: v_rndne_f32_e32 v2, s6
; GFX11-NEXT: v_rndne_f32_e32 v1, s5
; GFX11-NEXT: v_rndne_f32_e32 v0, s4
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -237,21 +237,21 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,12 +309,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -322,12 +322,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -406,18 +406,18 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v11, s3
-; VI-NEXT: v_mov_b32_e32 v9, s1
-; VI-NEXT: v_mov_b32_e32 v10, s2
-; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v11, s1
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v10, s0
+; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
@@ -426,7 +426,7 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
@@ -434,8 +434,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 277dc01..c19f7d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3036,21 +3036,21 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
;
; VI-LABEL: s_fneg_select_infloop_regression_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_bitcmp1_b32 s6, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT: s_and_b64 s[6:7], s[0:1], exec
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
; VI-NEXT: s_cselect_b32 s2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, double 0.0, double %arg
@@ -3096,17 +3096,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
;
; VI-LABEL: s_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; VI-NEXT: s_bitcmp1_b32 s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, half 0.0, half %arg
@@ -3236,19 +3236,19 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s6, 0
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 2c9042e..e3d3fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -21,11 +21,11 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fadd_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,13 +52,13 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_fneg_fabs_fadd_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|s[2:3]|
+; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|s[0:1]|
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%x = load double, ptr addrspace(1) %xptr, align 8
@@ -89,11 +89,11 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fmul_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_mul_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -122,12 +122,12 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: fneg_fabs_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -155,12 +155,12 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: fneg_fabs_fn_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -188,13 +188,13 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl
; VI-LABEL: fneg_fabs_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %in)
@@ -223,16 +223,16 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: fneg_fabs_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s5, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
@@ -268,27 +268,27 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: fneg_fabs_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s7, 31
; VI-NEXT: s_bitset1_b32 s5, 31
-; VI-NEXT: s_or_b32 s2, s11, 0x80000000
-; VI-NEXT: s_or_b32 s3, s9, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_or_b32 s0, s11, 0x80000000
+; VI-NEXT: s_or_b32 s1, s9, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 32033c5..2a1ca0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -79,13 +79,13 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_fabsf_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc = bitcast i32 %in to float
@@ -141,13 +141,13 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: v_fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -251,18 +251,18 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fneg_fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s6, 0x80000000
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s6, 0x80000000
; VI-NEXT: s_bitset1_b32 s5, 31
; VI-NEXT: s_bitset1_b32 s4, 31
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 94fc929..66b5cad 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -19,26 +19,26 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: s_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,18 +116,18 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; VI-LABEL: s_fneg_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s7, 0x80000000
-; VI-NEXT: s_xor_b32 s3, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s1, s6, 0x80000000
; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -135,17 +135,17 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
-; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s1, s6, 0x80000000
; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,24 +168,24 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fsub0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -210,26 +210,26 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,24 +253,24 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fold_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, -s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, -s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,24 +295,24 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -339,26 +339,26 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,24 +391,24 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -448,25 +448,25 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -500,21 +500,21 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -563,24 +563,24 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,31 +631,31 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
;
; VI-LABEL: s_fneg_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80008000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -714,31 +714,31 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_f16_e64 v1, s2, 2.0
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_f16_e64 v1, s1, 2.0
; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 7f87b41..157b748 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -37,10 +37,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -49,10 +49,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -76,10 +76,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -88,10 +88,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index ca2fa0f..afca450 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -22,28 +22,28 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
;
; VI-LABEL: test_isinf_pattern:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -70,28 +70,28 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s4|, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -115,20 +115,20 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -155,28 +155,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -205,28 +205,28 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,27 +253,27 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_not_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -306,31 +306,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,31 +417,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_u_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -470,28 +470,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -520,28 +520,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
;
; VI-LABEL: test_isfinite_pattern_4_commute_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_commute_and:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -592,17 +592,17 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x50
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s1, s4, 0x1f8
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -632,28 +632,28 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
;
; VI-LABEL: test_isinf_pattern_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,28 +684,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -738,28 +738,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_4_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 2928647..0b49b73 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -52,22 +52,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -75,11 +75,11 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -118,22 +118,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -318,13 +318,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[8:10], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s0, 4
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, s10
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
@@ -444,22 +444,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -467,11 +467,11 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -510,22 +510,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index f4745a5..c35da12 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -104,22 +104,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -462,22 +462,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index bf3dbec..2663bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1022,22 +1022,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1047,22 +1047,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1072,22 +1072,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1097,35 +1097,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1207,35 +1207,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1479,33 +1479,33 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1564,10 +1564,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1581,11 +1581,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1593,10 +1593,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1636,10 +1636,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1760,23 +1760,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1829,10 +1829,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1846,23 +1846,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1892,23 +1892,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index a058c11..f710456 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 046c92a..f308174 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index b4fee70..facb3e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32:
@@ -59,12 +59,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_i32_fabs:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2|
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32_fabs:
@@ -147,17 +147,17 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: fp_to_sint_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_i32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_i32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_i32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_i32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_i32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_i32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_i32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_i32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i32:
@@ -217,24 +217,24 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s4, 0x2f800000
-; VI-NEXT: s_mov_b32 s5, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s5, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_xor_b32_e32 v0, v2, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i64:
@@ -509,24 +509,24 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fp_to_sint_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s9, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s9, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
+; VI-NEXT: v_mul_f32_e64 v3, |v4|, s0
; VI-NEXT: v_floor_f32_e32 v3, v3
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s9, |v4|
+; VI-NEXT: v_fma_f32 v3, v3, s1, |v4|
; VI-NEXT: v_xor_b32_e32 v2, v2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v0
@@ -534,22 +534,22 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
-; VI-NEXT: v_trunc_f32_e32 v5, s7
+; VI-NEXT: v_trunc_f32_e32 v5, s11
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
-; VI-NEXT: v_mul_f32_e64 v6, |v5|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v5|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v5|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v5|
; VI-NEXT: v_cvt_u32_f32_e32 v6, v6
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; VI-NEXT: v_ashrrev_i32_e32 v4, 31, v5
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_xor_b32_e32 v5, v6, v4
-; VI-NEXT: v_mul_f32_e64 v6, |v8|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v8|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v9, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v8|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v8|
; VI-NEXT: v_cvt_u32_f32_e32 v10, v6
; VI-NEXT: v_xor_b32_e32 v7, v7, v4
; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4
@@ -558,10 +558,10 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_xor_b32_e32 v4, v10, v5
; VI-NEXT: v_xor_b32_e32 v8, v9, v5
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_subb_u32_e32 v5, vcc, v8, v5, vcc
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i64:
@@ -749,14 +749,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -799,14 +799,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -850,12 +850,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_f32_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_f32_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index f8ede1c..364e8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %
; VI-LABEL: fp_to_uint_f32_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i32:
@@ -107,17 +107,17 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_u32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_u32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_u32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_u32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_u32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_u32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_u32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_u32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i32:
@@ -170,18 +170,18 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; VI-LABEL: fp_to_uint_f32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xcf800000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_floor_f32_e32 v2, v1
-; VI-NEXT: v_fma_f32 v0, v2, s3, v0
+; VI-NEXT: v_fma_f32 v0, v2, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i64:
@@ -412,38 +412,38 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s2, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: v_floor_f32_e32 v6, v2
-; VI-NEXT: v_fma_f32 v0, v5, s2, v0
+; VI-NEXT: v_fma_f32 v0, v5, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT: v_fma_f32 v0, v6, s2, v4
-; VI-NEXT: v_trunc_f32_e32 v4, s7
+; VI-NEXT: v_fma_f32 v0, v6, s0, v4
+; VI-NEXT: v_trunc_f32_e32 v4, s11
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_floor_f32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8
; VI-NEXT: v_floor_f32_e32 v9, v5
-; VI-NEXT: v_fma_f32 v4, v6, s2, v4
+; VI-NEXT: v_fma_f32 v4, v6, s0, v4
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
; VI-NEXT: v_cvt_u32_f32_e32 v6, v4
-; VI-NEXT: v_fma_f32 v4, v9, s2, v8
+; VI-NEXT: v_fma_f32 v4, v9, s0, v8
; VI-NEXT: v_cvt_u32_f32_e32 v5, v9
; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
@@ -631,14 +631,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -681,14 +681,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -732,12 +732,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i
; VI-LABEL: fp_to_uint_f32_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 82c25c0..2c74b3d 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fpext_f16_to_f32(
;
; GFX89-LABEL: fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fpext_f16_to_f64(
;
; GFX89-LABEL: fpext_f16_to_f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -161,42 +161,42 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
;
; GFX89-LABEL: fpext_v2f16_to_v2f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -234,38 +234,38 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
;
; GFX89-LABEL: fpext_v2f16_to_v2f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -299,38 +299,27 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
-; VI-LABEL: s_fneg_fpext_f16_to_f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: s_fneg_fpext_f16_to_f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: s_fneg_fpext_f16_to_f32:
+; GFX89: ; %bb.0: ; %entry
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -363,38 +352,38 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -429,38 +418,38 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -495,38 +484,38 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -568,45 +557,45 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -649,45 +638,45 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -729,45 +718,45 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -810,45 +799,45 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -890,45 +879,45 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -972,45 +961,45 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1031,3 +1020,6 @@ entry:
declare half @llvm.fabs.f16(half) #1
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 238010e..ca58708 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
;
; VI-LABEL: fptosi_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
;
; VI-LABEL: fptosi_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
;
; VI-LABEL: fptosi_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -241,37 +241,37 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
;
; VI-LABEL: fptosi_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v1, v0
; VI-NEXT: v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
@@ -280,7 +280,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,38 +318,38 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
;
; VI-LABEL: fptosi_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
; VI-NEXT: v_cvt_i32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -358,7 +358,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -401,17 +401,17 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
;
; VI-LABEL: fptosi_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -419,22 +419,22 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -446,7 +446,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -475,28 +475,28 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptosi_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, -1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 1116dc9..2d5ae03 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
;
; VI-LABEL: fptoui_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
;
; VI-LABEL: fptoui_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
;
; VI-LABEL: fptoui_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -240,37 +240,37 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
;
; VI-LABEL: fptoui_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v1, v0
; VI-NEXT: v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
@@ -279,7 +279,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,38 +317,38 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
;
; VI-LABEL: fptoui_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_u32_f32_e32 v0, v1
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -357,7 +357,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,17 +400,17 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
;
; VI-LABEL: fptoui_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -418,22 +418,22 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -473,28 +473,28 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptoui_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, 1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 6cc7368..3873036 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -41,94 +41,94 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -176,102 +176,102 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -323,109 +323,109 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -481,93 +481,93 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
@@ -577,27 +577,27 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -643,94 +643,94 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -777,94 +777,94 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -911,94 +911,94 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1046,98 +1046,98 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1185,98 +1185,98 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1327,102 +1327,102 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX9-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index e4aa4d1..bcef7bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -28,66 +28,66 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
;
; VI-SDAG-LABEL: fptrunc_f64_to_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -218,358 +218,356 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
;
; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; VI-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; VI-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s0
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s3, s0, 12
+; VI-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s1, s3
+; VI-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
+; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s1, 12
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s1, s6
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s8, s1
+; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; VI-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; VI-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SAFE-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
-; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
-; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX10-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX10-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
-; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
-; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-SDAG-NEXT: s_nop 0
; GFX11-SAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-GISEL: ; %bb.0:
-; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX11-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX11-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX11-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-GISEL-NEXT: s_nop 0
; GFX11-SAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-SDAG: ; %bb.0:
-; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX11-UNSAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-SDAG-NEXT: s_nop 0
; GFX11-UNSAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-GISEL: ; %bb.0:
-; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-UNSAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-GISEL-NEXT: s_nop 0
; GFX11-UNSAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-GISEL-NEXT: s_endpgm
@@ -595,79 +593,79 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s10, -1
+; VI-GISEL-NEXT: s_mov_b32 s11, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -696,87 +694,89 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
-; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x2
+; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -803,91 +803,91 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s15, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s14, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -919,9 +919,9 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s23, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s22, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -931,16 +931,16 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s22, -1
+; VI-GISEL-NEXT: s_mov_b32 s23, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -950,17 +950,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -970,17 +966,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -990,17 +986,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -1010,20 +1006,20 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -1033,9 +1029,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 0d59021..c7e284d 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -93,12 +93,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -160,12 +160,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -185,12 +185,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -335,12 +335,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -357,12 +357,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -446,12 +446,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -504,12 +504,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -526,12 +526,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -629,12 +629,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -720,12 +720,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -757,12 +757,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -853,12 +853,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -911,12 +911,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -933,12 +933,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1014,12 +1014,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1218,12 +1218,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1303,12 +1303,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v12, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1338,12 +1338,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v12, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1536,12 +1536,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1567,12 +1567,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1687,12 +1687,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1761,12 +1761,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1792,12 +1792,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1956,12 +1956,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -2053,12 +2053,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2091,12 +2091,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2346,11 +2346,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
@@ -2493,12 +2493,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2553,12 +2553,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -2864,12 +2864,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -2922,12 +2922,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -3152,11 +3152,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3378,12 +3378,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3478,12 +3478,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v8, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3731,11 +3731,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3859,12 +3859,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v16, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -3913,12 +3913,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v16, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index fecf303..2e36b53 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -210,22 +210,22 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_not_b32 s3, s3
-; VI-NEXT: s_lshr_b32 s7, s5, 1
+; VI-NEXT: s_not_b32 s1, s3
+; VI-NEXT: s_lshr_b32 s0, s5, 1
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: s_not_b32 s1, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s3, s4, 1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -293,18 +293,18 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1
; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1
-; GFX11-NEXT: s_lshr_b32 s5, s5, 1
-; GFX11-NEXT: s_not_b32 s3, s3
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_lshr_b32 s0, s5, 1
+; GFX11-NEXT: s_not_b32 s1, s3
+; GFX11-NEXT: s_lshr_b32 s3, s4, 1
; GFX11-NEXT: s_not_b32 s2, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v0, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s3, v3, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -332,14 +332,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -386,12 +386,12 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -441,34 +441,34 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_not_b32 s3, s15
-; VI-NEXT: s_lshr_b32 s2, s7, 1
+; VI-NEXT: s_not_b32 s1, s15
+; VI-NEXT: s_lshr_b32 s0, s7, 1
; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v3, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_not_b32 s3, s14
+; VI-NEXT: s_not_b32 s1, s14
; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s6, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s6, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: s_not_b32 s3, s13
+; VI-NEXT: s_not_b32 s1, s13
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s5, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s5, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_not_b32 s3, s12
+; VI-NEXT: s_not_b32 s1, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s4, 1
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -534,29 +534,29 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
;
; GFX10-LABEL: fshl_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX10-NEXT: s_lshr_b32 s2, s7, 1
-; GFX10-NEXT: s_not_b32 s3, s15
+; GFX10-NEXT: s_lshr_b32 s0, s7, 1
+; GFX10-NEXT: s_not_b32 s1, s15
; GFX10-NEXT: s_lshr_b32 s6, s6, 1
; GFX10-NEXT: s_not_b32 s7, s14
; GFX10-NEXT: s_lshr_b32 s5, s5, 1
; GFX10-NEXT: s_not_b32 s9, s13
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_not_b32 s8, s12
-; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX10-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v4i32:
@@ -564,26 +564,26 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX11-NEXT: s_lshr_b32 s2, s7, 1
-; GFX11-NEXT: s_not_b32 s3, s15
+; GFX11-NEXT: s_lshr_b32 s0, s7, 1
+; GFX11-NEXT: s_not_b32 s1, s15
; GFX11-NEXT: s_lshr_b32 s6, s6, 1
; GFX11-NEXT: s_not_b32 s7, s14
; GFX11-NEXT: s_lshr_b32 s5, s5, 1
; GFX11-NEXT: s_not_b32 s9, s13
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
; GFX11-NEXT: s_not_b32 s8, s12
-; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX11-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -615,7 +615,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshl_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -624,17 +624,17 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 25
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -644,7 +644,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
@@ -683,14 +683,14 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index a5ea1ee..860fe74 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -196,7 +196,7 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -204,8 +204,8 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -259,14 +259,14 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -294,14 +294,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshr_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -348,12 +348,12 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,7 +391,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s15
@@ -405,8 +405,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -414,8 +414,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s15
@@ -429,7 +429,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32:
@@ -474,7 +474,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
@@ -485,7 +485,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -517,7 +517,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshr_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -526,17 +526,17 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -546,7 +546,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
@@ -583,14 +583,14 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index f72d4e0..6de84a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fsub_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fsub_f16_imm_a(
;
; GFX89-LABEL: fsub_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fsub_f16_imm_b(
;
; GFX89-LABEL: fsub_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fsub_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
;
; VI-LABEL: fsub_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x40003c00
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
;
; VI-LABEL: fsub_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbc00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, -2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0xbc00c000
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 1853aa9..6d868e84 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v0, v1, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -31,17 +31,17 @@ bb:
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -61,15 +61,15 @@ bb:
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, v1, v0
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -89,11 +89,11 @@ bb:
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_and_b32_e32 v1, v3, v1
; GCN-NEXT: v_and_b32_e32 v0, v2, v0
@@ -102,7 +102,7 @@ define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-NEXT: v_and_b32_e32 v0, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -122,14 +122,14 @@ bb:
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v0, v1, v0
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v2
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -149,18 +149,18 @@ bb:
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
index 0612383..98bb405 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx803 -d - | FileCheck -check-prefix=DISASSEMBLY-VI %s
@@ -6,7 +7,7 @@
; FIXME: This will still fail for gfx6/7 and gfx10 subtargets.
; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000
-; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100
+; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v2, v0, v0, vcc // {{[0-9A-Z]+}}: 00040100
define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 {
; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
@@ -18,13 +19,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
+; GCN-NEXT: global_atomic_add_f32 v0, v1, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: .LBB0_2:
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index b8ecbae..d3dc660 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, s0
-; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT: s_endpgm
;
@@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT: s_endpgm
;
@@ -158,10 +158,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s0
-; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT: s_endpgm
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index f709eae..41327f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -13,15 +13,15 @@
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_dword v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_add_u32_e32 v2, 0xffffff00, v0
; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GCN-NEXT: .LBB0_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -54,21 +54,21 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_movk_i32 s0, 0x100
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_ushort v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_movk_i32 s1, 0x100
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: .LBB1_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3
+; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %bb2
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 4d585cf..7653cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_add_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -112,14 +112,14 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_add_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -394,13 +394,13 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_add_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -616,13 +616,13 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_and_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_and_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1068,13 +1068,13 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_sub_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1298,13 +1298,13 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_sub_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1520,13 +1520,13 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_max_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1736,13 +1736,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_max_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i32:
@@ -1940,13 +1940,13 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umax_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32_offset:
@@ -2152,13 +2152,13 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umax_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32:
@@ -2356,13 +2356,13 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_min_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32_offset:
@@ -2568,13 +2568,13 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_min_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32:
@@ -2772,13 +2772,13 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umin_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32_offset:
@@ -2984,13 +2984,13 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umin_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32:
@@ -3190,13 +3190,13 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_or_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3420,13 +3420,13 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_or_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3642,13 +3642,13 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_xchg_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3686,13 +3686,13 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %
;
; VI-LABEL: atomic_xchg_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3916,13 +3916,13 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xchg_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4632,13 +4632,13 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_xor_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4862,13 +4862,13 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xor_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -5087,31 +5087,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 4
@@ -5141,31 +5141,31 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i32_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
@@ -5193,31 +5193,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr addrspace(1) %in, i64 4
@@ -5245,29 +5245,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -5298,14 +5298,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5313,9 +5311,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64_offset:
@@ -5363,22 +5363,22 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64:
@@ -5425,14 +5425,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5440,9 +5438,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_addr64_offset:
@@ -5796,29 +5796,29 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
;
; VI-LABEL: atomic_load_i8_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 16
@@ -5848,31 +5848,31 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
;
; VI-LABEL: atomic_load_i8_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
@@ -5977,29 +5977,29 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i16_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 8
@@ -6029,31 +6029,31 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i16_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
@@ -6307,13 +6307,13 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_inc_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6400,14 +6400,14 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_inc_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6681,13 +6681,13 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_dec_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6774,14 +6774,14 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_dec_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -7058,29 +7058,29 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 8
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7109,31 +7109,31 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_f16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 -256
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7160,29 +7160,29 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
;
; VI-LABEL: atomic_load_bf16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
@@ -7211,31 +7211,31 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_bf16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 3050da03..b8031c6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4753,26 +4753,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB92_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4782,8 +4782,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB92_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4963,24 +4963,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB94_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4990,8 +4990,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB94_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6006,26 +6006,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB106_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6035,8 +6035,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_cbranch_execnz .LBB106_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6121,24 +6121,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB107_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6148,8 +6148,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB107_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -7997,26 +7997,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB129_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8026,8 +8026,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB129_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -8194,24 +8194,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB131_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8221,8 +8221,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB131_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f5dbaaf..a6c8f66 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -20,36 +20,36 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_add_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -115,15 +115,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -155,10 +155,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_add_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -191,11 +191,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
@@ -233,56 +233,56 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_add_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -312,38 +312,38 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_add_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -408,15 +408,15 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -447,10 +447,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_add_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -481,11 +481,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
@@ -522,54 +522,54 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_add_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -596,36 +596,36 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_and_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -691,15 +691,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -731,10 +731,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_and_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -767,11 +767,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
@@ -809,56 +809,56 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_and_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -888,38 +888,38 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_and_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -984,15 +984,15 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1023,10 +1023,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_and_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1057,11 +1057,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
@@ -1098,54 +1098,54 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_and_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1172,36 +1172,36 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_sub_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1267,15 +1267,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1307,10 +1307,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_sub_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1343,11 +1343,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
@@ -1385,56 +1385,56 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_sub_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1464,38 +1464,38 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_sub_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1560,15 +1560,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,10 +1599,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_sub_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
@@ -1674,54 +1674,54 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_sub_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1746,32 +1746,32 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_max_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -1834,15 +1834,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_max_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1904,11 +1904,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
@@ -1945,54 +1945,54 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2020,34 +2020,34 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_max_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2109,15 +2109,15 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,10 +2146,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2176,11 +2176,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
@@ -2216,52 +2216,52 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2286,32 +2286,32 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umax_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2374,15 +2374,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umax_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2412,10 +2412,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2444,11 +2444,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
@@ -2485,54 +2485,54 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2560,34 +2560,34 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umax_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2686,10 +2686,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umax_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2716,11 +2716,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
@@ -2756,52 +2756,52 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2826,32 +2826,32 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_min_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2914,15 +2914,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2952,10 +2952,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
@@ -3025,54 +3025,54 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3100,34 +3100,34 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3189,15 +3189,15 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_min_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3226,10 +3226,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_min_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3256,11 +3256,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
@@ -3296,52 +3296,52 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3366,32 +3366,32 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umin_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3454,15 +3454,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3492,10 +3492,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umin_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -3524,11 +3524,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
@@ -3565,54 +3565,54 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umin_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3640,34 +3640,34 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umin_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3729,15 +3729,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umin_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3766,10 +3766,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umin_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
@@ -3836,52 +3836,52 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umin_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3908,36 +3908,36 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_or_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4003,15 +4003,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4043,10 +4043,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; VI-LABEL: atomic_or_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4079,11 +4079,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
@@ -4121,56 +4121,56 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
;
; VI-LABEL: atomic_or_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4200,38 +4200,38 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_or_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4296,15 +4296,15 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4335,10 +4335,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_or_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -4369,11 +4369,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
@@ -4410,54 +4410,54 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: atomic_or_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4484,36 +4484,36 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_xchg_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4539,36 +4539,36 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
;
; VI-LABEL: atomic_xchg_f64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_f64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4594,36 +4594,36 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_pointer_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_pointer_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4689,15 +4689,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4729,10 +4729,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_xchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4765,11 +4765,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
@@ -4807,56 +4807,56 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_xchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4886,38 +4886,38 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xchg_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4982,15 +4982,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_xchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5021,10 +5021,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_xchg_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5055,11 +5055,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
@@ -5096,54 +5096,54 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5170,36 +5170,36 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_xor_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5265,15 +5265,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xor_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5305,10 +5305,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_xor_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -5341,11 +5341,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
@@ -5383,56 +5383,56 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_xor_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5462,38 +5462,38 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xor_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5558,15 +5558,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5597,10 +5597,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_xor_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5631,11 +5631,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
@@ -5672,54 +5672,54 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xor_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5786,11 +5786,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5859,11 +5859,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5897,50 +5897,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_cmpxchg_i64_ret_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5974,18 +5974,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_cmpxchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5994,16 +5994,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6011,14 +6011,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6058,19 +6058,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6084,17 +6084,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6104,11 +6104,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6184,11 +6184,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6221,50 +6221,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: atomic_cmpxchg_i64_ret:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6297,16 +6297,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; VI-LABEL: atomic_cmpxchg_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6315,16 +6315,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6332,14 +6332,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6378,17 +6378,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: s_add_u32 s2, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6402,17 +6402,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6422,11 +6422,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6464,42 +6464,42 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6531,42 +6531,42 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_i64_neg_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xffffffe0
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xffffffe0
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_neg_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_neg_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:-32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6596,40 +6596,40 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6662,14 +6662,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6677,9 +6675,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64_offset:
@@ -6700,17 +6700,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6745,22 +6745,22 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64:
@@ -6781,17 +6781,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6825,14 +6825,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6840,9 +6838,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f64_addr64_offset:
@@ -6863,17 +6863,17 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6901,34 +6901,34 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
;
; VI-LABEL: atomic_store_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 32
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_add_u32 s0, s6, 32
+; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6954,32 +6954,32 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
;
; VI-LABEL: atomic_store_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7008,10 +7008,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; VI-LABEL: atomic_store_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7040,11 +7040,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7078,10 +7078,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; VI-LABEL: atomic_store_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -7108,11 +7108,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -7145,10 +7145,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; VI-LABEL: atomic_store_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7177,11 +7177,11 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7211,36 +7211,36 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_inc_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_inc_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7306,15 +7306,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_inc_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7346,10 +7346,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_inc_i64_incr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7382,11 +7382,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
@@ -7416,36 +7416,36 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_dec_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7511,15 +7511,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_dec_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7551,10 +7551,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_dec_i64_decr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7587,11 +7587,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index cafd35a..200aa19 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -4905,26 +4905,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4932,9 +4932,9 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB88_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -5025,76 +5025,76 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB89_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5146,24 +5146,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
;
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s4, s0, s4
-; VI-NEXT: s_addc_u32 s5, s1, s5
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB90_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5263,25 +5263,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB91_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5293,44 +5293,44 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB91_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6367,26 +6367,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -6394,9 +6394,9 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB102_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -6487,76 +6487,76 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB103_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6613,25 +6613,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB104_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -6643,44 +6643,44 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB104_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8703,26 +8703,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8730,9 +8730,9 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB125_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -8823,76 +8823,76 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB126_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8942,20 +8942,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: .LBB127_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8963,38 +8963,38 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB127_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB127_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -9050,25 +9050,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB128_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -9080,44 +9080,44 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB128_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6555ceb..9d174be 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -54,95 +54,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -157,14 +157,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -179,13 +179,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -193,95 +193,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -296,14 +296,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-DPP-NEXT: .LBB0_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -318,13 +318,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-DPP-NEXT: .LBB0_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1131,12 +1131,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1158,64 +1158,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1238,27 +1238,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1269,8 +1269,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1280,25 +1280,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1323,11 +1323,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1335,12 +1335,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1362,64 +1362,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1442,27 +1442,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1473,8 +1473,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1484,25 +1484,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2349,11 +2349,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2361,12 +2361,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2388,64 +2388,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2499,8 +2499,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2510,25 +2510,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2565,12 +2565,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2592,64 +2592,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2672,27 +2672,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2703,8 +2703,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2714,25 +2714,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4247,11 +4247,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4259,12 +4259,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4286,64 +4286,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4366,27 +4366,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4397,8 +4397,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4408,25 +4408,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4451,11 +4451,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4490,64 +4490,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4570,27 +4570,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4601,8 +4601,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4612,25 +4612,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5452,101 +5452,101 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5562,165 +5562,165 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5736,64 +5736,64 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6349,11 +6349,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6361,13 +6361,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6387,68 +6387,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6471,28 +6471,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6503,8 +6503,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6514,25 +6514,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7296,11 +7296,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7308,13 +7308,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7334,68 +7334,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7418,28 +7418,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7450,8 +7450,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7461,25 +7461,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7504,11 +7504,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7516,13 +7516,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7542,68 +7542,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7626,28 +7626,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7658,8 +7658,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7669,25 +7669,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8721,11 +8721,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8733,13 +8733,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8759,68 +8759,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -8843,28 +8843,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -8875,8 +8875,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -8886,25 +8886,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -8929,11 +8929,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -8941,13 +8941,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -8967,68 +8967,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9051,28 +9051,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9083,8 +9083,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9094,25 +9094,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -9637,330 +9637,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB18_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-NEXT: .LBB18_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-NEXT: .LBB18_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-NEXT: .LBB18_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-NEXT: .LBB18_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-NEXT: .LBB18_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-DPP-NEXT: .LBB18_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-DPP-NEXT: .LBB18_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-DPP-NEXT: .LBB18_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-DPP-NEXT: .LBB18_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-DPP-NEXT: .LBB18_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -10007,330 +10007,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB19_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-NEXT: .LBB19_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-NEXT: .LBB19_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-NEXT: .LBB19_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-NEXT: .LBB19_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-NEXT: .LBB19_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-DPP-NEXT: .LBB19_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-DPP-NEXT: .LBB19_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-DPP-NEXT: .LBB19_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-DPP-NEXT: .LBB19_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-DPP-NEXT: .LBB19_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6548792..fdb36b3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 6936cdc..d47a424 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5cb5770..1d251f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -54,330 +54,330 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1223,11 +1223,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1262,64 +1262,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1342,27 +1342,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1373,8 +1373,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1384,25 +1384,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1427,11 +1427,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1439,12 +1439,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1466,64 +1466,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1546,27 +1546,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1577,8 +1577,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1588,25 +1588,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2453,11 +2453,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2465,12 +2465,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2492,64 +2492,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2572,27 +2572,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2603,8 +2603,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2614,25 +2614,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2657,11 +2657,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2669,12 +2669,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2696,64 +2696,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2776,27 +2776,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2807,8 +2807,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2818,25 +2818,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4455,11 +4455,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4467,12 +4467,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4494,64 +4494,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4574,27 +4574,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4605,8 +4605,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4616,25 +4616,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4659,11 +4659,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4671,12 +4671,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4698,64 +4698,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4778,27 +4778,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4809,8 +4809,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4820,25 +4820,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5660,101 +5660,101 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5770,165 +5770,165 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5944,64 +5944,64 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6765,11 +6765,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6777,13 +6777,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6803,68 +6803,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6887,28 +6887,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6919,8 +6919,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6930,25 +6930,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7503,11 +7503,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7515,13 +7515,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7541,68 +7541,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7625,28 +7625,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7657,8 +7657,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7668,25 +7668,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7711,11 +7711,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7723,13 +7723,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7749,68 +7749,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7833,28 +7833,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7865,8 +7865,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7876,25 +7876,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8927,11 +8927,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8939,13 +8939,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8965,68 +8965,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -9049,28 +9049,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -9081,8 +9081,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -9092,25 +9092,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -9135,11 +9135,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -9147,13 +9147,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -9173,68 +9173,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9257,28 +9257,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9289,8 +9289,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9300,25 +9300,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b518..5abd4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT: s_movk_i32 s4, 0x130
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: s_movk_i32 s20, 0x130
+; CHECK-NEXT: s_mov_b32 s21, s24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v4, s36, 0
; CHECK-NEXT: v_writelane_b32 v4, s37, 1
@@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s44, 8
; CHECK-NEXT: v_writelane_b32 v4, s45, 9
; CHECK-NEXT: v_writelane_b32 v4, s46, 10
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
; CHECK-NEXT: v_writelane_b32 v4, s47, 11
; CHECK-NEXT: v_writelane_b32 v4, s48, 12
; CHECK-NEXT: v_writelane_b32 v4, s49, 13
@@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s13, 25
; CHECK-NEXT: v_writelane_b32 v4, s14, 26
; CHECK-NEXT: v_writelane_b32 v4, s15, 27
-; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s52, 18
-; CHECK-NEXT: v_writelane_b32 v4, s17, 29
+; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s53, 19
-; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: v_writelane_b32 v4, s17, 29
; CHECK-NEXT: v_writelane_b32 v8, s54, 20
-; CHECK-NEXT: v_writelane_b32 v4, s19, 31
-; CHECK-NEXT: s_mov_b32 s4, 48
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: s_mov_b32 s26, 48
+; CHECK-NEXT: s_mov_b32 s27, s24
; CHECK-NEXT: v_writelane_b32 v8, s55, 21
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v4, s19, 31
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0
; CHECK-NEXT: v_writelane_b32 v8, s56, 22
; CHECK-NEXT: v_writelane_b32 v8, s57, 23
; CHECK-NEXT: v_writelane_b32 v8, s58, 24
@@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v8, s65, 31
; CHECK-NEXT: v_writelane_b32 v4, s9, 37
; CHECK-NEXT: v_writelane_b32 v8, s66, 32
-; CHECK-NEXT: s_movk_i32 s26, 0x1f0
-; CHECK-NEXT: s_movk_i32 s28, 0x2f0
-; CHECK-NEXT: s_mov_b32 s27, s24
+; CHECK-NEXT: s_movk_i32 s28, 0x1f0
+; CHECK-NEXT: s_movk_i32 s30, 0x2f0
; CHECK-NEXT: s_mov_b32 s29, s24
+; CHECK-NEXT: s_mov_b32 s31, s24
; CHECK-NEXT: v_writelane_b32 v4, s10, 38
; CHECK-NEXT: v_writelane_b32 v8, s67, 33
; CHECK-NEXT: v_writelane_b32 v4, s11, 39
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 7ee31bf..c6342e5 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -7,9 +7,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -36,15 +36,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_add_i32 s10, s11, 1
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s11
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -52,20 +52,21 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: udiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB0_1: ; %bb3
@@ -83,15 +84,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_add_i32 s10, s11, 1
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s11
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -101,11 +102,11 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -114,10 +115,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -136,15 +137,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_add_i32 s10, s11, 1
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s11
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -172,9 +173,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: urem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -199,15 +200,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_sub_i32 s10, s9, s6
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -215,20 +216,21 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: urem32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB1_1: ; %bb3
@@ -244,15 +246,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_sub_i32 s10, s9, s6
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -262,11 +264,11 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -275,10 +277,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -296,15 +298,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_sub_i32 s10, s9, s6
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -331,14 +333,14 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GFX9-LABEL: sdiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s5, 0, s2
-; GFX9-NEXT: s_ashr_i32 s4, s4, 31
+; GFX9-NEXT: s_abs_i32 s4, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_ashr_i32 s1, s5, 31
+; GFX9-NEXT: s_sub_i32 s5, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -349,70 +351,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB2_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5
-; GFX9-NEXT: s_mul_i32 s7, s6, s2
-; GFX9-NEXT: s_sub_i32 s7, s3, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s0, s5
+; GFX9-NEXT: s_mul_i32 s7, s6, s4
+; GFX9-NEXT: s_sub_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_sub_i32 s9, s7, s2
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_sub_i32 s9, s7, s4
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s4
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_xor_b32 s6, s6, s1
+; GFX9-NEXT: s_sub_i32 s6, s6, s1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s3
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s4, 0, s2
-; GFX10-NEXT: s_ashr_i32 s3, s3, 31
+; GFX10-NEXT: s_abs_i32 s4, s5
+; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s1, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-NEXT: v_readfirstlane_b32 s6, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s4, s4, s5
-; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: s_add_i32 s5, s5, s6
+; GFX10-NEXT: s_mul_i32 s1, s1, s6
+; GFX10-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: .LBB2_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX10-NEXT: s_mul_i32 s7, s6, s2
+; GFX10-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX10-NEXT: s_mul_i32 s7, s6, s4
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_sub_i32 s7, s4, s7
-; GFX10-NEXT: s_sub_i32 s9, s7, s2
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_sub_i32 s7, s1, s7
+; GFX10-NEXT: s_sub_i32 s9, s7, s4
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
; GFX10-NEXT: s_cselect_b32 s7, s9, s7
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
-; GFX10-NEXT: s_add_i32 s4, s4, 1
-; GFX10-NEXT: s_xor_b32 s6, s6, s3
-; GFX10-NEXT: s_sub_i32 s6, s6, s3
+; GFX10-NEXT: s_add_i32 s1, s1, 1
+; GFX10-NEXT: s_xor_b32 s6, s6, s0
+; GFX10-NEXT: s_sub_i32 s6, s6, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -420,51 +422,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s3
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s4, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s5
+; GFX11-NEXT: s_ashr_i32 s0, s5, 31
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s1, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s1, s1, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_add_i32 s5, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_add_i32 s5, s6, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB2_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX11-NEXT: s_mul_i32 s7, s6, s2
+; GFX11-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX11-NEXT: s_mul_i32 s7, s6, s4
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_sub_i32 s7, s4, s7
+; GFX11-NEXT: s_sub_i32 s7, s1, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s9, s7, s2
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_sub_i32 s9, s7, s4
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
; GFX11-NEXT: s_cselect_b32 s7, s9, s7
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
-; GFX11-NEXT: s_add_i32 s4, s4, 1
-; GFX11-NEXT: s_xor_b32 s6, s6, s3
+; GFX11-NEXT: s_add_i32 s1, s1, 1
+; GFX11-NEXT: s_xor_b32 s6, s6, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s6, s3
+; GFX11-NEXT: s_sub_i32 s6, s6, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -491,37 +493,38 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
+; GFX9-NEXT: s_abs_i32 s4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_sub_i32 s1, 0, s4
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s1, s1, s5
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s1
+; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX9-NEXT: s_mul_i32 s5, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s3, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX9-NEXT: s_mul_i32 s5, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s0, s5
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -530,85 +533,85 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s3, 0, s2
+; GFX10-NEXT: s_abs_i32 s4, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s0, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s3, s3, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_add_i32 s4, s4, s5
+; GFX10-NEXT: s_mul_i32 s0, s0, s1
+; GFX10-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_add_i32 s1, s1, s5
; GFX10-NEXT: .LBB3_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX10-NEXT: s_mul_i32 s5, s5, s2
-; GFX10-NEXT: s_sub_i32 s5, s3, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX10-NEXT: s_mul_i32 s5, s5, s4
+; GFX10-NEXT: s_sub_i32 s5, s0, s5
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s3, s3, 1
+; GFX10-NEXT: s_add_i32 s0, s0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: srem32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s3, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s0, 0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_i32 s3, s3, s4
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB3_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX11-NEXT: s_mul_i32 s5, s5, s2
+; GFX11-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX11-NEXT: s_mul_i32 s5, s5, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s5, s3, s5
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s5, s0, s5
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
-; GFX11-NEXT: s_add_i32 s3, s3, 1
+; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -748,12 +751,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: v_mov_b32_e32 v3, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -765,11 +768,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: global_store_short v5, v4, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v5, v4, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -807,13 +811,13 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: urem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB5_1: ; %bb3
@@ -833,10 +837,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3
-; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX11-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4
-; GFX11-NEXT: global_store_b16 v5, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v5, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB5_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -863,18 +867,18 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: sdiv16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB6_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s5, s6, 30
; GFX9-NEXT: s_or_b32 s5, s5, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -883,15 +887,16 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s5, s5, 0
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s4
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: s_and_b32 s6, 0xffff, s1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s5, v4
; GFX9-NEXT: s_lshl_b32 s5, s6, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -935,21 +940,21 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s4, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -964,12 +969,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -996,18 +1001,18 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB7_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s6, s6, 30
; GFX9-NEXT: s_or_b32 s8, s6, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -1016,17 +1021,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s6, s8, 0
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: s_and_b32 s7, 0xffff, s4
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s6, v4
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 1
+; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -1073,21 +1080,21 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: srem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s5, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -1105,14 +1112,14 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: v_mov_b32_e32 v2, s5
-; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2
+; GFX11-NEXT: v_mul_lo_u32 v3, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3
-; GFX11-NEXT: global_store_b16 v2, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 9da07ea..06a5816 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -52,7 +52,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -60,10 +60,10 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -102,18 +102,19 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -179,7 +180,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MulMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -190,7 +191,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -198,9 +199,9 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -242,21 +243,22 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -317,7 +319,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -328,7 +330,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -336,10 +338,10 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -378,18 +380,19 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -451,7 +454,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedTypedMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -462,7 +465,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -470,10 +473,10 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -515,21 +518,22 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -591,7 +595,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_alt_AddOperands:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -602,7 +606,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -610,10 +614,10 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -656,18 +660,19 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -729,7 +734,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedExt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -740,7 +745,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -748,10 +753,10 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -793,21 +798,22 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -867,7 +873,7 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_SameVec:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -878,16 +884,16 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -931,22 +937,23 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1009,7 +1016,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1020,7 +1027,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1035,10 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1070,18 +1077,19 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1143,7 +1151,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16_Hi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1158,7 +1166,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1166,10 +1174,10 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1208,18 +1216,19 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1282,7 +1291,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Even:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1293,7 +1302,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1301,10 +1310,10 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1346,21 +1355,22 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Even:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1423,7 +1433,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Middle:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1434,7 +1444,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1442,10 +1452,10 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1487,21 +1497,22 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Middle:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1563,7 +1574,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_DiffIndex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1574,7 +1585,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1582,10 +1593,10 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1627,21 +1638,22 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_DiffIndex:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1704,7 +1716,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1715,7 +1727,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1723,11 +1735,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1773,14 +1785,15 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1788,9 +1801,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1855,7 +1868,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1866,7 +1879,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -1874,11 +1887,11 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1924,14 +1937,15 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1939,9 +1953,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2006,7 +2020,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2017,7 +2031,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2025,11 +2039,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2079,14 +2093,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2094,10 +2109,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2163,7 +2178,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2174,7 +2189,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2182,11 +2197,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2236,14 +2251,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2251,10 +2267,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2320,7 +2336,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2331,7 +2347,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2339,11 +2355,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2391,14 +2407,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2406,10 +2423,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,7 +2492,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2486,7 +2503,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2494,11 +2511,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2546,14 +2563,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2561,10 +2579,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2629,7 +2647,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2640,8 +2658,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2770,7 +2788,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-LABEL: notsdot2_sext8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2781,7 +2799,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
@@ -2791,10 +2809,10 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2840,23 +2858,24 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fdd9138..c148ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,36 +115,38 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -236,7 +238,7 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -247,8 +249,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8
@@ -344,16 +346,16 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_i16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_i16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -437,7 +439,7 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -448,8 +450,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -529,16 +531,16 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -615,7 +617,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -626,7 +628,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -636,15 +638,15 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -695,45 +697,47 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -819,7 +823,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -830,7 +834,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3
@@ -844,12 +848,12 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -894,36 +898,38 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1001,7 +1007,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1012,8 +1018,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -1111,15 +1117,16 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1145,20 +1152,21 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1190,7 +1198,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1254,7 +1262,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1265,7 +1273,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8
@@ -1273,10 +1281,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1320,44 +1328,46 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1425,7 +1435,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1436,7 +1446,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -1445,12 +1455,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1498,44 +1508,46 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1610,7 +1622,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1621,7 +1633,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
@@ -1630,12 +1642,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1683,44 +1695,46 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1795,7 +1809,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1806,8 +1820,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8
@@ -1860,9 +1874,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
@@ -1871,14 +1886,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1886,7 +1902,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1970,22 +1986,22 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2001,20 +2017,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -2026,20 +2042,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2048,20 +2064,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2071,19 +2086,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0
-; GFX10-DL-NEXT: global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2093,7 +2108,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2177,22 +2192,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2205,20 +2220,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
@@ -2229,21 +2244,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2253,20 +2268,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2277,19 +2291,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2300,7 +2314,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2378,9 +2392,9 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2390,122 +2404,122 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_sext_i32_i16 s1, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_i32_i24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2
+; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2586,7 +2600,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2597,7 +2611,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -2606,12 +2620,12 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2659,44 +2673,46 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2776,22 +2792,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -2803,20 +2819,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8
@@ -2827,21 +2843,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -2851,20 +2867,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2875,19 +2890,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2898,7 +2913,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2986,7 +3001,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3004,12 +3019,12 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
@@ -3022,8 +3037,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3031,14 +3046,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -3048,52 +3063,52 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3104,23 +3119,23 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v0, v5, v0
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3133,8 +3148,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3231,7 +3246,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_nonstandard_signed:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -3243,8 +3258,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -3327,10 +3342,11 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_nonstandard_signed:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3355,14 +3371,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_nonstandard_signed:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
@@ -3390,7 +3407,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 0b131ea..86aab8c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,34 +115,36 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -227,7 +229,7 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -239,8 +241,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
@@ -329,16 +331,16 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -423,7 +425,7 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -434,8 +436,8 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -515,16 +517,16 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -595,7 +597,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -606,8 +608,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
@@ -684,14 +686,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -699,7 +701,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -765,7 +767,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -776,8 +778,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -857,16 +859,16 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -943,7 +945,7 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationAccrossMADs:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -954,8 +956,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -1035,16 +1037,16 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1122,7 +1124,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1133,7 +1135,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -1143,15 +1145,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1202,45 +1204,47 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1328,7 +1332,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1339,7 +1343,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
@@ -1349,16 +1353,16 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1408,46 +1412,48 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: s_add_i32 s0, s0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, s2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_add1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_u32 v3, v0, 8, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX11-DL-NEXT: s_add_i32 s0, s0, s0
; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_add3_u32 v0, s2, v2, v0
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_add3_u32 v0, s0, v2, v0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1535,7 +1541,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1547,8 +1553,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
@@ -1663,7 +1669,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1678,7 +1684,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1688,7 +1694,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1778,7 +1784,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1790,8 +1796,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
@@ -1920,7 +1926,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1931,7 +1937,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
@@ -1950,7 +1956,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2036,7 +2042,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2047,7 +2053,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
@@ -2059,12 +2065,12 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2106,34 +2112,36 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2208,7 +2216,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2220,8 +2228,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -2315,16 +2323,17 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2348,21 +2357,22 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0
@@ -2391,7 +2401,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2462,7 +2472,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2473,8 +2483,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2554,15 +2564,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2585,20 +2596,21 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc8_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -2630,7 +2642,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2691,7 +2703,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2702,7 +2714,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
@@ -2710,10 +2722,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2757,43 +2769,45 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2860,7 +2874,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2871,7 +2885,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -2880,12 +2894,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2933,43 +2947,45 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3043,7 +3059,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3054,7 +3070,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
@@ -3063,12 +3079,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3116,43 +3132,45 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3228,7 +3246,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3239,8 +3257,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8
@@ -3293,9 +3311,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3303,14 +3322,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -3318,7 +3338,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3402,22 +3422,22 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3433,20 +3453,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -3458,20 +3478,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3480,20 +3500,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3502,19 +3521,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3524,7 +3543,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3609,22 +3628,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3637,20 +3656,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
@@ -3661,21 +3680,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3685,20 +3704,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3708,19 +3726,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3731,7 +3749,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3811,9 +3829,9 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3823,122 +3841,122 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_and_b32 s1, s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NODL-NEXT: s_and_b32 s1, s8, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4019,7 +4037,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4030,7 +4048,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -4039,12 +4057,12 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4092,43 +4110,45 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4208,22 +4228,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -4235,20 +4255,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
@@ -4259,21 +4279,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -4283,20 +4303,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4306,19 +4325,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4329,7 +4348,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4417,7 +4436,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -4435,12 +4454,12 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
@@ -4453,8 +4472,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4462,14 +4481,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -4479,52 +4498,52 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4535,22 +4554,22 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4563,8 +4582,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s2
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4667,7 +4686,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_multi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4678,7 +4697,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -4686,7 +4705,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s2
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s0
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v1
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3
; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 8
@@ -4702,8 +4721,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v10, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4762,37 +4781,39 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_multi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-DL-NEXT: global_load_dword v3, v2, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v2, v1, v0, 0x6040200
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v4, v3, v3, 0x2000200
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s0
; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0x3010301
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_multi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v2, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v3, v1, v0, 0x6040200
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4801,10 +4822,10 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0x3010301
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4915,7 +4936,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hilo:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4928,8 +4949,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -4982,9 +5003,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hilo:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
@@ -4992,14 +5014,15 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hilo:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5007,7 +5030,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5088,7 +5111,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_lohi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5101,8 +5124,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5160,9 +5183,10 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_lohi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5173,14 +5197,15 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_lohi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5192,7 +5217,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5273,7 +5298,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hihi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5288,8 +5313,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
@@ -5347,9 +5372,10 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hihi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5360,14 +5386,15 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hihi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5379,7 +5406,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5456,7 +5483,7 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v8i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -5474,8 +5501,8 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -5513,28 +5540,30 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v8i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v8i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5617,7 +5646,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v16i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5630,8 +5659,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
; GFX8-NEXT: flat_load_dword v4, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
@@ -5696,10 +5725,11 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v16i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; kill: killed $vgpr5
; GFX10-DL-NEXT: ; kill: killed $vgpr4
; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
@@ -5712,15 +5742,16 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v16i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7]
@@ -5731,7 +5762,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5814,10 +5845,10 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v256i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_movk_i32 s2, 0xfc
+; GFX8-NEXT: s_movk_i32 s0, 0xfc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1
@@ -5826,11 +5857,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5890,10 +5921,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v256i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252
@@ -5903,15 +5935,16 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v256i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252
@@ -5921,7 +5954,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5997,7 +6030,7 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_anyext:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -6008,17 +6041,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -6063,41 +6096,43 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_anyext:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_anyext:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 8c53d26..036965d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -63,11 +63,15 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -77,11 +81,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -93,7 +93,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
@@ -109,8 +109,8 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -189,44 +189,44 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -372,11 +372,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -386,14 +391,9 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -599,20 +599,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc16:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -670,27 +670,26 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -748,7 +747,7 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -893,11 +892,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -907,14 +911,9 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -1120,20 +1119,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc8:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1191,27 +1190,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1269,7 +1267,7 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1401,11 +1399,15 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1415,11 +1417,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -1430,7 +1428,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
@@ -1449,8 +1447,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1560,18 +1558,18 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1585,7 +1583,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s0
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1605,25 +1603,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
-; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1637,7 +1635,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
+; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s0
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1657,7 +1655,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1788,11 +1786,15 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1802,11 +1804,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4
@@ -1826,7 +1824,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
@@ -1834,8 +1832,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1914,44 +1912,44 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2061,11 +2059,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2075,14 +2078,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
@@ -2315,19 +2313,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2401,26 +2399,26 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2494,7 +2492,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2604,11 +2602,16 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2618,14 +2621,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
@@ -2890,19 +2888,19 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2983,26 +2981,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -3083,7 +3081,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 3828fa5..f29908a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -61,11 +61,15 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -75,11 +79,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -99,7 +99,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -107,8 +107,8 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -187,22 +187,22 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -332,11 +332,15 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -346,13 +350,9 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -650,11 +650,15 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -664,13 +668,9 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -969,11 +969,15 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -983,13 +987,9 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1276,11 +1276,15 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1290,13 +1294,9 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1582,11 +1582,15 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1596,11 +1600,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1620,7 +1620,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3
@@ -1630,8 +1630,8 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1741,18 +1741,18 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1768,7 +1768,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0
; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
@@ -1786,7 +1786,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1916,11 +1916,15 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1930,11 +1934,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1954,7 +1954,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -1962,8 +1962,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2042,22 +2042,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2152,11 +2152,15 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2166,13 +2170,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -2324,20 +2324,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2381,7 +2381,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,11 +2475,15 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2489,13 +2493,9 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
@@ -2680,19 +2680,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2743,7 +2743,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2838,11 +2838,15 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2852,13 +2856,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -3013,20 +3013,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -3071,7 +3071,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -3156,7 +3156,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-LABEL: udot8_variant1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3167,7 +3167,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
@@ -3187,7 +3187,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
@@ -3195,8 +3195,8 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3261,18 +3261,19 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %v2addr,
ptr addrspace(1) %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index f7a0e29..66e54aa 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -17,13 +17,13 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_lo:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 5
; VI-NEXT: v_mov_b32_e32 v1, 0x12345678
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
@@ -45,13 +45,13 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_hi:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x12345678
; VI-NEXT: v_mov_b32_e32 v1, 5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
@@ -72,13 +72,13 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store i64 -9223372036854775808, ptr addrspace(1) %out
ret void
@@ -97,12 +97,12 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_neg_0.0_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store i32 -2147483648, ptr addrspace(1) %out
ret void
@@ -121,12 +121,12 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.0, ptr addrspace(1) %out
ret void
@@ -145,12 +145,12 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.0, ptr addrspace(1) %out
ret void
@@ -169,12 +169,12 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.5, ptr addrspace(1) %out
ret void
@@ -193,12 +193,12 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.5, ptr addrspace(1) %out
ret void
@@ -217,12 +217,12 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 1.0, ptr addrspace(1) %out
ret void
@@ -241,12 +241,12 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -1.0, ptr addrspace(1) %out
ret void
@@ -265,12 +265,12 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 2.0, ptr addrspace(1) %out
ret void
@@ -289,12 +289,12 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -2.0, ptr addrspace(1) %out
ret void
@@ -313,12 +313,12 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4.0, ptr addrspace(1) %out
ret void
@@ -337,12 +337,12 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -4.0, ptr addrspace(1) %out
ret void
@@ -361,12 +361,12 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.15915494
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0x3FC45F3060000000, ptr addrspace(1) %out
ret void
@@ -385,12 +385,12 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0xBFC45F3060000000, ptr addrspace(1) %out
ret void
@@ -409,12 +409,12 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x45800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4096.0, ptr addrspace(1) %out
ret void
@@ -434,13 +434,13 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.0
store float %y, ptr addrspace(1) %out
@@ -461,13 +461,13 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.5
store float %y, ptr addrspace(1) %out
@@ -488,13 +488,13 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -0.5
store float %y, ptr addrspace(1) %out
@@ -515,13 +515,13 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 1.0
store float %y, ptr addrspace(1) %out
@@ -542,13 +542,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -1.0
store float %y, ptr addrspace(1) %out
@@ -569,13 +569,13 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 2.0
store float %y, ptr addrspace(1) %out
@@ -596,13 +596,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -2.0
store float %y, ptr addrspace(1) %out
@@ -623,13 +623,13 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 4.0
store float %y, ptr addrspace(1) %out
@@ -650,13 +650,13 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -4.0
store float %y, ptr addrspace(1) %out
@@ -684,20 +684,20 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out,
;
; VI-LABEL: commute_add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 0.5
@@ -726,20 +726,20 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: commute_add_literal_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 1024.0
@@ -761,13 +761,13 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36a0000000000000
store float %y, ptr addrspace(1) %out
@@ -788,13 +788,13 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36b0000000000000
store float %y, ptr addrspace(1) %out
@@ -815,13 +815,13 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 16
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36e0000000000000
store float %y, ptr addrspace(1) %out
@@ -843,14 +843,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -1
@@ -874,14 +874,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -2
@@ -905,14 +905,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa
;
; VI-LABEL: add_inline_imm_neg_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -16
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -16
@@ -935,13 +935,13 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_63_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 63
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 63
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36ff800000000000
store float %y, ptr addrspace(1) %out
@@ -962,13 +962,13 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_64_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 64
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 64
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x3700000000000000
store float %y, ptr addrspace(1) %out
@@ -990,12 +990,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.0
store double %y, ptr addrspace(1) %out
@@ -1017,12 +1017,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.5
store double %y, ptr addrspace(1) %out
@@ -1044,12 +1044,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -0.5
store double %y, ptr addrspace(1) %out
@@ -1071,12 +1071,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 1.0
store double %y, ptr addrspace(1) %out
@@ -1098,12 +1098,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -1.0
store double %y, ptr addrspace(1) %out
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 2.0
store double %y, ptr addrspace(1) %out
@@ -1152,12 +1152,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -2.0
store double %y, ptr addrspace(1) %out
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 4.0
store double %y, ptr addrspace(1) %out
@@ -1206,12 +1206,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -4.0
store double %y, ptr addrspace(1) %out
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1264,14 +1264,14 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d
; VI-LABEL: add_m_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_1_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000001
store double %y, ptr addrspace(1) %out
@@ -1320,12 +1320,12 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_2_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000002
store double %y, ptr addrspace(1) %out
@@ -1347,12 +1347,12 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_16_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000010
store double %y, ptr addrspace(1) %out
@@ -1373,13 +1373,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, -1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xffffffffffffffff
store double %y, ptr addrspace(1) %out
@@ -1400,13 +1400,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_2_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffffe
store double %y, ptr addrspace(1) %out
@@ -1427,13 +1427,13 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_16_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -16
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffff0
store double %y, ptr addrspace(1) %out
@@ -1455,12 +1455,12 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_63_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x000000000000003F
store double %y, ptr addrspace(1) %out
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_64_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000040
store double %y, ptr addrspace(1) %out
@@ -1508,13 +1508,13 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.0, ptr addrspace(1) %out
ret void
@@ -1534,13 +1534,13 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_literal_imm_neg_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.0, ptr addrspace(1) %out
ret void
@@ -1560,13 +1560,13 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3fe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.5, ptr addrspace(1) %out
ret void
@@ -1586,13 +1586,13 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbfe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.5, ptr addrspace(1) %out
ret void
@@ -1612,13 +1612,13 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 1.0, ptr addrspace(1) %out
ret void
@@ -1638,13 +1638,13 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -1.0, ptr addrspace(1) %out
ret void
@@ -1664,13 +1664,13 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 2.0, ptr addrspace(1) %out
ret void
@@ -1690,13 +1690,13 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -2.0, ptr addrspace(1) %out
ret void
@@ -1716,13 +1716,13 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4.0, ptr addrspace(1) %out
ret void
@@ -1742,13 +1742,13 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xc0100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -4.0, ptr addrspace(1) %out
ret void
@@ -1768,13 +1768,13 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1794,13 +1794,13 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1820,13 +1820,13 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40b00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4096.0, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 3cabe41..44e8ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
- ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 18d5c05..2ecc51d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -4,28 +4,28 @@
define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
; GCN-LABEL: float4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_inselt_undef:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -56,24 +56,24 @@ entry:
define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
; GCN-LABEL: int4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
-; GCN-NEXT: s_cselect_b32 s3, s7, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
-; GCN-NEXT: s_cselect_b32 s6, s6, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
+; GCN-NEXT: s_cselect_b32 s0, s7, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
+; GCN-NEXT: s_cselect_b32 s1, s6, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, s4, 1
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, 1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -110,27 +110,27 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec
; GCN-LABEL: float8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s12, s[0:1], 0x64
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
-; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: s_mov_b32 m0, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s1
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -336,56 +336,56 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec,
; GCN-LABEL: half8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -468,98 +468,98 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-LABEL: byte16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 24
-; GCN-NEXT: s_cmp_lg_u32 s2, 15
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 24
+; GCN-NEXT: s_cmp_lg_u32 s8, 15
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 14
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 14
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 8
+; GCN-NEXT: s_lshr_b32 s0, s7, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 13
+; GCN-NEXT: s_cmp_lg_u32 s8, 13
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 12
+; GCN-NEXT: s_cmp_lg_u32 s8, 12
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s6, 24
+; GCN-NEXT: s_lshr_b32 s0, s6, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 11
+; GCN-NEXT: s_cmp_lg_u32 s8, 11
; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 10
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 10
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 8
+; GCN-NEXT: s_lshr_b32 s0, s6, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 9
+; GCN-NEXT: s_cmp_lg_u32 s8, 9
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 8
+; GCN-NEXT: s_cmp_lg_u32 s8, 8
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s5, 24
+; GCN-NEXT: s_lshr_b32 s0, s5, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 8
+; GCN-NEXT: s_lshr_b32 s0, s5, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_lshr_b32 s3, s4, 24
+; GCN-NEXT: s_lshr_b32 s0, s4, 24
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 8
+; GCN-NEXT: s_lshr_b32 s0, s4, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
@@ -567,8 +567,8 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -580,22 +580,22 @@ entry:
define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
; GCN-LABEL: double2_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s6, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5
+; GCN-NEXT: s_cmp_eq_u32 s8, 1
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s1, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s2
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -607,48 +607,48 @@ entry:
define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4
-; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84
-; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; GCN-NEXT: s_load_dword s14, s[0:1], 0xa4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x84
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s12, 4
-; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
-; GCN-NEXT: s_cselect_b32 s8, 0, s8
-; GCN-NEXT: s_cmp_eq_u32 s12, 1
+; GCN-NEXT: s_cmp_eq_u32 s14, 4
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
-; GCN-NEXT: s_cmp_eq_u32 s12, 0
-; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
-; GCN-NEXT: s_cselect_b32 s14, 0, s0
-; GCN-NEXT: s_cmp_eq_u32 s12, 3
-; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s1, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s12, 2
+; GCN-NEXT: s_cmp_eq_u32 s14, 1
+; GCN-NEXT: s_cselect_b32 s7, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s6, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s14, 0
; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
+; GCN-NEXT: s_cmp_eq_u32 s14, 3
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s11
+; GCN-NEXT: s_cselect_b32 s1, 0, s10
+; GCN-NEXT: s_cmp_eq_u32 s14, 2
+; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
+; GCN-NEXT: s_cselect_b32 s8, 0, s8
; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: s_add_u32 s0, s10, 16
+; GCN-NEXT: s_add_u32 s0, s12, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NEXT: s_add_u32 s0, s10, 32
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NEXT: s_add_u32 s0, s12, 32
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -661,12 +661,12 @@ entry:
define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
; GCN-LABEL: double8_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4
+; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_lshl_b32 s0, s20, 1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
@@ -683,29 +683,29 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v
; GCN-NEXT: v_mov_b32_e32 v13, s17
; GCN-NEXT: v_mov_b32_e32 v14, s18
; GCN-NEXT: v_mov_b32_e32 v15, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v16
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -773,11 +773,12 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: s_load_dword s2, s[0:1], 0x124
; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
@@ -809,53 +810,53 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: v_mov_b32_e32 v29, s17
; GCN-NEXT: v_mov_b32_e32 v30, s18
; GCN-NEXT: v_mov_b32_e32 v31, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: v_mov_b32_e32 v33, s3
-; GCN-NEXT: v_mov_b32_e32 v32, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v33, s1
+; GCN-NEXT: v_mov_b32_e32 v32, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: v_mov_b32_e32 v28, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v29, s1
+; GCN-NEXT: v_mov_b32_e32 v28, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v25, s3
-; GCN-NEXT: v_mov_b32_e32 v24, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v25, s1
+; GCN-NEXT: v_mov_b32_e32 v24, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -875,12 +876,14 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_load_dword s4, s[0:1], 0x124
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v28, s2
+; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s2, s4, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
@@ -906,49 +909,48 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: v_mov_b32_e32 v25, s21
; GCN-NEXT: v_mov_b32_e32 v26, s22
; GCN-NEXT: v_mov_b32_e32 v27, s23
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v31, s3
-; GCN-NEXT: v_mov_b32_e32 v30, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v31, s1
+; GCN-NEXT: v_mov_b32_e32 v30, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_add_u32 s0, s0, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29]
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 68427e8..eb7c587 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1741,20 +1741,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dword s8, s[4:5], 0x10
+; VI-NEXT: s_load_dword s10, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 3
+; VI-NEXT: s_lshl_b32 s0, s10, 3
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT: s_and_b32 s9, s1, 0x5050505
+; VI-NEXT: s_and_b32 s3, s1, 0x5050505
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT: s_and_b32 s8, s0, 0x5050505
-; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT: s_andn2_b64 s[8:9], s[8:9], s[0:1]
+; VI-NEXT: s_and_b32 s2, s0, 0x5050505
+; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 1313460..e351b6d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -907,12 +907,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -936,12 +936,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB7_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -966,12 +966,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: .LBB7_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -995,12 +995,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB7_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1025,13 +1025,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB7_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1056,13 +1056,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB7_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
%val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
@@ -1095,12 +1095,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -1123,12 +1123,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: .LBB8_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -1151,12 +1151,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10-NEXT: .LBB8_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB8_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1208,13 +1208,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1238,13 +1238,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: .LBB8_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a344128..aab7b57 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
;
; VI-LABEL: i8_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -92,13 +92,13 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
;
; VI-LABEL: i8_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -167,13 +167,13 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
;
; VI-LABEL: i8_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i8 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i8 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -242,13 +242,13 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
;
; VI-LABEL: i16_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -314,13 +314,13 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
;
; VI-LABEL: i16_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,13 +389,13 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
;
; VI-LABEL: i16_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i16 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -858,18 +858,18 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
;
; VI-LABEL: v3i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_u32 s0, s0, 2
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: s_add_u32 s0, s2, 2
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_endpgm
@@ -1118,13 +1118,13 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; VI-LABEL: v3i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1197,13 +1197,13 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; VI-LABEL: v3f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1396,15 +1396,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
;
; VI-LABEL: v4i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1470,15 +1470,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
;
; VI-LABEL: v4f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1688,19 +1688,19 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16>
; VI-LABEL: v5i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dword s6, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 8
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_add_u32 s0, s2, 8
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: flat_store_short v[2:3], v4
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -1920,22 +1920,22 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
;
; VI-LABEL: v5i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2018,22 +2018,22 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
;
; VI-LABEL: v5f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: flat_store_dword v[1:2], v3
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2124,32 +2124,32 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2266,32 +2266,32 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
;
; VI-LABEL: v8i16_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2904,23 +2904,23 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; VI-LABEL: v8i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3015,23 +3015,23 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; VI-LABEL: v8f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3120,15 +3120,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
;
; VI-LABEL: v16i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3577,23 +3577,23 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; VI-LABEL: v16i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4045,41 +4045,41 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; VI-LABEL: v16i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4233,41 +4233,41 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; VI-LABEL: v16f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4401,12 +4401,12 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
;
; VI-LABEL: kernel_arg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: f64_kernel_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4652,13 +4652,13 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
;
; VI-LABEL: i1_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4743,13 +4743,13 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4816,14 +4816,14 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -4891,13 +4891,13 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfe_i32 s0, s4, 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4967,13 +4967,13 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; VI-LABEL: i1_arg_sext_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -5089,25 +5089,25 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
;
; VI-LABEL: struct_argument_alignment:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dword s7, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x44
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5254,14 +5254,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v7, s4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dword v[2:3], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
@@ -5413,32 +5413,32 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
;
; VI-LABEL: struct_argument_alignment_after:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s8, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s9, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
+; VI-NEXT: s_load_dword s10, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s11, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5902,12 +5902,12 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; VI-LABEL: byref_align_constant_i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x124
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index 142a6ed..1f14da1 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -80,25 +80,25 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_min_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -159,20 +159,20 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmin:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_min_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin:
@@ -235,26 +235,26 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_min_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -289,49 +289,48 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmin:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_min_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmin:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_min_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -406,25 +405,25 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_max_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -485,20 +484,20 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmax:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_max_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax:
@@ -561,26 +560,26 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_max_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -615,49 +614,48 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmax:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_max_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmax:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_max_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -740,28 +738,28 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_min_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -827,22 +825,22 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmin_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_min_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin_f64:
@@ -917,30 +915,30 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_min_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1013,24 +1011,25 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmin_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_min_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -1113,28 +1112,28 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_max_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1200,22 +1199,22 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmax_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_max_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax_f64:
@@ -1290,30 +1289,30 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_max_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1386,24 +1385,25 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmax_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_max_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index e1124f3..90623c0 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: liveins: $sgpr0_sgpr1
; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0
; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 01a1ab4..2c3e3fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -78,12 +78,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -111,12 +111,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -166,14 +166,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -219,12 +219,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -262,16 +262,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -279,35 +279,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -340,16 +340,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -357,35 +357,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -423,14 +423,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -476,12 +476,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -525,14 +525,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -578,12 +578,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -627,14 +627,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -680,12 +680,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -730,14 +730,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -783,12 +783,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 224de95..edd88da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -549,8 +549,8 @@ end:
; GCN-LABEL: {{^}}test_export_clustering:
; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
-; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
+; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s2
+; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s3
; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index a737c5e..0567b42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -126,20 +126,20 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -150,13 +150,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -208,13 +208,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -234,14 +234,14 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -266,13 +266,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -292,14 +292,14 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -324,13 +324,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -350,14 +350,14 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -382,13 +382,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -408,14 +408,14 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -440,13 +440,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -466,14 +466,14 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -498,13 +498,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -524,14 +524,14 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -556,13 +556,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -582,14 +582,14 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -614,13 +614,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -640,14 +640,14 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -672,13 +672,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -698,14 +698,14 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -730,13 +730,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -756,14 +756,14 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -788,13 +788,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -814,14 +814,14 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -872,14 +872,14 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -904,13 +904,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -930,14 +930,14 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -961,47 +961,47 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
store i32 %result, ptr addrspace(1) %out
@@ -1011,47 +1011,47 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_one:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_one:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_one:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_one:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
store i32 %result, ptr addrspace(1) %out
@@ -1061,47 +1061,47 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
store i32 %result, ptr addrspace(1) %out
@@ -1111,47 +1111,47 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
store i32 %result, ptr addrspace(1) %out
@@ -1161,47 +1161,47 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_olt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
store i32 %result, ptr addrspace(1) %out
@@ -1211,47 +1211,47 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ole:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
store i32 %result, ptr addrspace(1) %out
@@ -1261,47 +1261,47 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
store i32 %result, ptr addrspace(1) %out
@@ -1311,47 +1311,47 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_o:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_o:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_o:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_o:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
store i32 %result, ptr addrspace(1) %out
@@ -1361,47 +1361,47 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uo:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
store i32 %result, ptr addrspace(1) %out
@@ -1411,47 +1411,47 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_une:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_une:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_une:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_une:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
store i32 %result, ptr addrspace(1) %out
@@ -1461,47 +1461,47 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
store i32 %result, ptr addrspace(1) %out
@@ -1511,47 +1511,47 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
store i32 %result, ptr addrspace(1) %out
@@ -1561,47 +1561,47 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
store i32 %result, ptr addrspace(1) %out
@@ -1611,47 +1611,47 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
store i32 %result, ptr addrspace(1) %out
@@ -1663,14 +1663,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1691,15 +1691,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1727,14 +1727,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1755,15 +1755,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1798,20 +1798,20 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -1823,13 +1823,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1849,14 +1849,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1882,13 +1882,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1908,14 +1908,14 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1941,13 +1941,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1967,14 +1967,14 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2000,13 +2000,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2026,14 +2026,14 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2059,13 +2059,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2085,14 +2085,14 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2118,13 +2118,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2144,14 +2144,14 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2177,13 +2177,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2203,14 +2203,14 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2236,13 +2236,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2262,14 +2262,14 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2295,13 +2295,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2321,14 +2321,14 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2354,13 +2354,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2380,14 +2380,14 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2413,13 +2413,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2439,14 +2439,14 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2471,13 +2471,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2497,14 +2497,14 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2529,13 +2529,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2555,14 +2555,14 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2587,13 +2587,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2613,14 +2613,14 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 7d41cf1..62a007e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -137,10 +137,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -151,10 +151,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f32:
@@ -163,10 +163,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; VI-GISEL-LABEL: v_fcmp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
@@ -178,15 +178,15 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -206,29 +206,29 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
@@ -240,15 +240,15 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,29 +268,29 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
@@ -302,15 +302,15 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_lt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -330,29 +330,29 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
@@ -364,15 +364,15 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_le_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,29 +392,29 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
@@ -426,15 +426,15 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -454,29 +454,29 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
@@ -488,15 +488,15 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -516,29 +516,29 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
@@ -550,15 +550,15 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -578,29 +578,29 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7)
@@ -612,15 +612,15 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_u_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -640,29 +640,29 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8)
@@ -674,15 +674,15 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,29 +702,29 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
@@ -736,15 +736,15 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -764,29 +764,29 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
@@ -798,15 +798,15 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,29 +826,29 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
@@ -860,15 +860,15 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -888,29 +888,29 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
@@ -922,15 +922,15 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -950,29 +950,29 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
@@ -984,15 +984,15 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1012,29 +1012,29 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
@@ -1045,56 +1045,56 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oeq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_eq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oeq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
@@ -1105,56 +1105,56 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
@@ -1165,56 +1165,56 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ogt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_lt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ogt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
@@ -1225,56 +1225,56 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_le_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
@@ -1285,56 +1285,56 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_olt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_gt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_olt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
@@ -1345,56 +1345,56 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ole:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ole:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
@@ -1405,56 +1405,56 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ueq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlg_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ueq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
@@ -1465,56 +1465,56 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_o:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_o_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_o:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
@@ -1525,56 +1525,56 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
@@ -1585,56 +1585,56 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_une:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_une:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
@@ -1645,56 +1645,56 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
@@ -1705,56 +1705,56 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
@@ -1765,56 +1765,56 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nle_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
@@ -1825,56 +1825,56 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
@@ -1887,17 +1887,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1918,31 +1918,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -1956,17 +1956,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1987,31 +1987,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -2028,10 +2028,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2042,10 +2042,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f16:
@@ -2054,10 +2054,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; VI-GISEL-LABEL: v_fcmp_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
@@ -2070,15 +2070,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2098,29 +2098,29 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
@@ -2133,15 +2133,15 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2161,29 +2161,29 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
@@ -2196,15 +2196,15 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_lt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2224,29 +2224,29 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
@@ -2259,15 +2259,15 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_le_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2287,29 +2287,29 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
@@ -2322,15 +2322,15 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_gt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2350,29 +2350,29 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
@@ -2385,15 +2385,15 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2413,29 +2413,29 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
@@ -2448,15 +2448,15 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlg_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2476,29 +2476,29 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
@@ -2511,15 +2511,15 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2539,29 +2539,29 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
@@ -2574,15 +2574,15 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2602,29 +2602,29 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
@@ -2637,15 +2637,15 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ngt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2665,29 +2665,29 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
@@ -2700,15 +2700,15 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nle_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2728,29 +2728,29 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
@@ -2762,15 +2762,15 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_o_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2790,29 +2790,29 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7)
@@ -2824,15 +2824,15 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_u_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,29 +2852,29 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8)
@@ -2886,15 +2886,15 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2914,29 +2914,29 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index ca06a57b..528d289e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -8,15 +8,15 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_bf16_bf16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -34,18 +34,17 @@ entry:
}
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
-; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
-; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
-; SDAG-GFX11-NEXT: s_endpgm
-;
+; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v0, off, s6
+; GFX11-NEXT: scratch_load_u16 v1, off, s7
+; GFX11-NEXT: scratch_load_b32 v2, off, s5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11-NEXT: scratch_store_b16 off, v0, s4
+; GFX11-NEXT: s_endpgm
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -95,3 +94,5 @@ entry:
}
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SDAG-GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 99c3dea..7edf3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -7,15 +7,15 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_f16_f16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -35,26 +35,26 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
+; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s6
+; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s7
+; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s5
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s4
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1
-; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2
-; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3
+; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s5
+; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s6
+; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s7
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s4
; GISEL-GFX11-NEXT: s_endpgm
ptr addrspace(5) %r,
ptr addrspace(5) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index e51b1d2..40c6925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -7,16 +7,16 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0 clamp
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -37,16 +37,16 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index 434fa1b..690362c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -7,20 +7,20 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
@@ -31,14 +31,15 @@ entry:
define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -47,13 +48,13 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index f6197e0..c2eb771 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -8,12 +8,12 @@ declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index a2dc366..96835c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -8,12 +8,12 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v1, v0, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index ae61b58..1e1ea10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -22,13 +22,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -48,14 +48,14 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -87,20 +87,20 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -111,13 +111,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -137,14 +137,14 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -169,13 +169,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -195,14 +195,14 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -227,13 +227,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -253,14 +253,14 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -285,13 +285,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -311,14 +311,14 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -343,13 +343,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -369,14 +369,14 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -401,13 +401,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i32_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -427,14 +427,14 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i32_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -459,13 +459,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -485,14 +485,14 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -517,13 +517,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -543,14 +543,14 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -575,13 +575,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -601,14 +601,14 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -632,47 +632,47 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_eq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_eq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_eq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_eq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
store i32 %result, ptr addrspace(1) %out
@@ -682,47 +682,47 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_ne:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_ne:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_ne:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_ne:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
store i32 %result, ptr addrspace(1) %out
@@ -732,47 +732,47 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
store i32 %result, ptr addrspace(1) %out
@@ -782,47 +782,47 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
store i32 %result, ptr addrspace(1) %out
@@ -832,47 +832,47 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
store i32 %result, ptr addrspace(1) %out
@@ -882,47 +882,47 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
store i32 %result, ptr addrspace(1) %out
@@ -932,47 +932,47 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sgt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
store i32 %result, ptr addrspace(1) %out
@@ -982,47 +982,47 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
store i32 %result, ptr addrspace(1) %out
@@ -1032,47 +1032,47 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_slt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_slt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_slt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_slt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
store i32 %result, ptr addrspace(1) %out
@@ -1082,47 +1082,47 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sle:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sle:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sle:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sle:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
store i32 %result, ptr addrspace(1) %out
@@ -1133,13 +1133,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1159,14 +1159,14 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1198,20 +1198,20 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -1222,13 +1222,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1280,13 +1280,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1306,14 +1306,14 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1338,13 +1338,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1364,14 +1364,14 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1396,13 +1396,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1422,14 +1422,14 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1454,13 +1454,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1480,14 +1480,14 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1512,13 +1512,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i16_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1538,14 +1538,14 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i16_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1570,13 +1570,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1596,14 +1596,14 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1628,13 +1628,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1654,14 +1654,14 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1686,13 +1686,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1712,14 +1712,14 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 54931ac..ae285c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -25,30 +25,30 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -67,15 +67,15 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
@@ -98,29 +98,29 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i32:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i32:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -131,30 +131,30 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -173,15 +173,15 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
@@ -193,30 +193,30 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -235,15 +235,15 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
@@ -255,30 +255,30 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -297,15 +297,15 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
@@ -317,30 +317,30 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -359,15 +359,15 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
@@ -379,30 +379,30 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -421,15 +421,15 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
@@ -441,30 +441,30 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GFX11-LABEL: v_icmp_i32_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -483,15 +483,15 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i32_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
@@ -503,30 +503,30 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -545,15 +545,15 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
@@ -565,30 +565,30 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -607,15 +607,15 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
@@ -627,30 +627,30 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
@@ -688,56 +688,56 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_eq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_eq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
@@ -748,56 +748,56 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_ne:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ne_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_ne:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
@@ -808,56 +808,56 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
@@ -868,56 +868,56 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
@@ -928,56 +928,56 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
@@ -988,56 +988,56 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
@@ -1048,56 +1048,56 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sgt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sgt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
@@ -1108,56 +1108,56 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
@@ -1168,56 +1168,56 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_slt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_slt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
@@ -1228,56 +1228,56 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
@@ -1289,30 +1289,30 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1331,15 +1331,15 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32)
@@ -1362,29 +1362,29 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i16:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i16:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -1395,30 +1395,30 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1437,15 +1437,15 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33)
@@ -1457,30 +1457,30 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1499,15 +1499,15 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34)
@@ -1519,30 +1519,30 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1561,15 +1561,15 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35)
@@ -1581,30 +1581,30 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1623,15 +1623,15 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36)
@@ -1643,30 +1643,30 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1685,15 +1685,15 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37)
@@ -1705,30 +1705,30 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GFX11-LABEL: v_icmp_i16_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1747,15 +1747,15 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i16_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38)
@@ -1767,30 +1767,30 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1809,15 +1809,15 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39)
@@ -1829,30 +1829,30 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1871,15 +1871,15 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40)
@@ -1891,30 +1891,30 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1933,15 +1933,15 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 3a77b3b..cffd9a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -349,9 +349,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -362,22 +363,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
@@ -388,36 +390,37 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -442,9 +445,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -452,22 +456,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
@@ -475,34 +480,35 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 634159a..c9bdc70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -165,19 +165,19 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-SDAG-LABEL: v_permlane16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -185,18 +185,18 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-GISEL-LABEL: v_permlane16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -596,13 +596,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,14 +641,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -689,19 +689,19 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -709,18 +709,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1075,11 +1075,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1088,11 +1088,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,11 +1118,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1131,11 +1131,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1175,12 +1175,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1189,13 +1189,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1204,12 +1204,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1218,13 +1218,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1250,11 +1250,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1263,11 +1263,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1294,11 +1294,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,11 +1307,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1338,11 +1338,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1351,11 +1351,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1382,11 +1382,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1425,11 +1425,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1438,11 +1438,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1496,13 +1496,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1511,12 +1511,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1525,13 +1525,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1557,11 +1557,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1570,11 +1570,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1601,11 +1601,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1614,11 +1614,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1645,11 +1645,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 77a975f..2cc49c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -445,13 +445,13 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -459,14 +459,14 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -480,13 +480,13 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -494,14 +494,14 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -516,14 +516,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -531,14 +531,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -552,13 +552,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -566,14 +566,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -588,13 +588,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -602,14 +602,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -624,13 +624,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -638,14 +638,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -660,13 +660,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -674,14 +674,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -695,13 +695,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -709,14 +709,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -731,14 +731,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -746,14 +746,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -767,13 +767,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -781,14 +781,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -803,13 +803,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -839,13 +839,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -853,14 +853,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index b81cb97..84edbb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -9,13 +9,13 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -27,12 +27,12 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -44,22 +44,22 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG-LABEL: test_v:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index cb511c9..bf3d0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -8,11 +8,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -37,11 +37,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -53,14 +53,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -101,16 +101,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -158,18 +158,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -196,12 +196,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 01df7634..2be7ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -10,11 +10,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -39,11 +39,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -51,11 +51,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -67,14 +67,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -99,11 +99,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -111,11 +111,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX12-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -127,16 +127,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -165,13 +165,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -179,13 +179,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body
; GFX12-PACKED-SDAG-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-SDAG-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-SDAG-NEXT: s_nop 0
; GFX12-PACKED-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-SDAG-NEXT: s_endpgm
@@ -193,14 +193,14 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body
; GFX12-PACKED-GISEL-NEXT: s_clause 0x1
-; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4
+; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s2, s2, s2
; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-GISEL-NEXT: s_nop 0
; GFX12-PACKED-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-GISEL-NEXT: s_endpgm
@@ -213,18 +213,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f52461b6..2dc346a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -83,12 +83,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -96,12 +96,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -109,11 +109,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -121,11 +121,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -138,98 +138,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -280,241 +280,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -556,10 +556,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -590,11 +590,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -628,10 +628,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -661,11 +661,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -698,10 +698,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -731,11 +731,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -768,10 +768,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -801,11 +801,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -839,10 +839,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -875,11 +875,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -915,10 +915,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -951,10 +951,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index bfdb2da..bfae6f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -30,12 +30,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -84,12 +84,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -97,12 +97,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -122,11 +122,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -139,98 +139,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -281,241 +281,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, -1
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, -1
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, -1
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, -1
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -557,10 +557,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -591,11 +591,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -629,10 +629,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -662,11 +662,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -699,10 +699,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -732,11 +732,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -802,11 +802,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -840,10 +840,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -876,11 +876,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -916,10 +916,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -952,10 +952,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 3eb2261..e034076 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -5,34 +5,34 @@
define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -51,34 +51,34 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 1
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 1
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -97,34 +97,34 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 0
; GCN-NEXT: s_barrier_wait 0
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 0
; GLOBAL-ISEL-NEXT: s_barrier_wait 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -143,7 +143,7 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
@@ -151,29 +151,29 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -222,43 +222,43 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst -1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -278,43 +278,43 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -334,43 +334,43 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -390,45 +390,45 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_mov_b32 m0, 1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -732,29 +732,29 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -772,29 +772,29 @@ entry:
define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -812,29 +812,29 @@ entry:
define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -924,41 +924,41 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_leave:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_barrier_leave
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_leave:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_barrier_leave
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -978,29 +978,29 @@ entry:
define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1018,29 +1018,29 @@ entry:
define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1058,29 +1058,29 @@ entry:
define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1170,27 +1170,27 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, -1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, -1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, -1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1206,27 +1206,27 @@ entry:
define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1242,27 +1242,27 @@ entry:
define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 0
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 0
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1352,34 +1352,34 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test_barrier_convert:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test_barrier_convert:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index eb30484..3883b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -5,22 +5,22 @@
define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_doorbell:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -32,22 +32,22 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_ddid:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_ddid:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -59,12 +59,12 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tma:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TMA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_realtime:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_REALTIME)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,22 +93,22 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_savewave:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_savewave:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -120,12 +120,12 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tba:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TBA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -137,22 +137,22 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_0_i32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_0_i32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -164,12 +164,12 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_99999_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], 99999
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8f8994e..2c5efd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -24,12 +24,12 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -39,19 +39,19 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -61,13 +61,13 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -81,30 +81,30 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s3, 56
-; GCN-NEXT: s_mov_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b64 s[0:1], -1
; GCN-NEXT: s_cbranch_scc1 .LBB4_3
; GCN-NEXT: ; %bb.1: ; %Flow
-; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_cbranch_vccz .LBB4_4
; GCN-NEXT: .LBB4_2: ; %.exit
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB4_3: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GCN-NEXT: s_cbranch_execnz .LBB4_2
; GCN-NEXT: .LBB4_4: ; %.zero
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
@@ -127,17 +127,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -147,21 +147,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -171,17 +171,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x10001
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -191,17 +191,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3c003c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -259,17 +259,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -351,19 +351,19 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -373,16 +373,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -392,16 +392,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -411,16 +411,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -430,16 +430,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 87c5f5b..7bcafea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -147,12 +147,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -172,12 +172,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -204,19 +204,19 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: bfe_u32_zextload_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in
%ext = zext i8 %load to i32
@@ -248,21 +248,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: bfe_u32_zext_in_reg_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -294,21 +294,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: bfe_u32_zext_in_reg_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -341,22 +341,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0
; VI-NEXT: v_bfe_u32 v0, v0, 1, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -389,22 +389,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0
; VI-NEXT: v_bfe_u32 v0, v0, 3, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -437,22 +437,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0x80, v0
; VI-NEXT: v_bfe_u32 v0, v0, 7, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -484,21 +484,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou
;
; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -529,20 +529,20 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
@@ -563,12 +563,12 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -590,12 +590,12 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -617,12 +617,12 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -653,20 +653,20 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -698,21 +698,21 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -742,20 +742,20 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -785,20 +785,20 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -828,20 +828,20 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
@@ -870,20 +870,20 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
@@ -912,20 +912,20 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
@@ -954,20 +954,20 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
@@ -997,20 +997,20 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = ashr i32 %x, 31
@@ -1031,12 +1031,12 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = lshr i32 %x, 31
@@ -1057,12 +1057,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1082,12 +1082,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1107,12 +1107,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1132,12 +1132,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1157,12 +1157,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1182,12 +1182,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1207,12 +1207,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1232,12 +1232,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1257,12 +1257,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1282,12 +1282,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1332,12 +1332,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1357,12 +1357,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1382,12 +1382,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1407,12 +1407,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 40
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_15:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1457,12 +1457,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_17:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1507,12 +1507,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1593,14 +1593,14 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: lshr_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = lshr i32 %a, 6
%c = and i32 %b, 7
@@ -1657,14 +1657,14 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 448
%c = lshr i32 %b, 6
@@ -1687,14 +1687,14 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 511
%c = lshr i32 %b, 6
@@ -1717,14 +1717,14 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: shl_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x150002
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x150002
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = shl i32 %a, 9
%c = lshr i32 %b, 11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index eeddb3d..7edac87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @ceil_f16(
;
; VI-LABEL: ceil_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -130,37 +130,37 @@ define amdgpu_kernel void @ceil_v2f16(
;
; VI-LABEL: ceil_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_ceil_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
@@ -172,31 +172,31 @@ define amdgpu_kernel void @ceil_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index fcc4cb3..28d3e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cos_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cos_f16_e32 v2, v3
; GFX9-NEXT: v_cos_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_cos_f16_e32 v2, v3
; GFX10-NEXT: v_cos_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_cos_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4f65acd..d60e07d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -12,84 +12,86 @@
define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -97,39 +99,39 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_f32:
@@ -853,7 +855,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -870,16 +871,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -891,17 +893,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -915,9 +917,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -926,7 +929,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -936,14 +938,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8a000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -952,19 +955,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -987,19 +990,20 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1043,15 +1047,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1096,7 +1100,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v3f32:
@@ -1593,7 +1597,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1610,37 +1613,38 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1649,17 +1653,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1673,9 +1677,10 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1685,7 +1690,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1701,49 +1705,50 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1764,19 +1769,20 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1787,8 +1793,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1833,17 +1839,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1900,7 +1905,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ff20f90..bd167dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -14,84 +14,86 @@
define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp10_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp10_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -99,39 +101,39 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_f32:
@@ -855,7 +857,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -872,16 +873,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -893,17 +895,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -917,9 +919,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -928,7 +931,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -938,14 +940,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x40549000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -954,19 +957,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -989,19 +992,20 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1045,15 +1049,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1098,7 +1102,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v3f32:
@@ -1595,7 +1599,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1612,37 +1615,38 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1651,17 +1655,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1675,9 +1679,10 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1687,7 +1692,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1703,49 +1707,50 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1766,19 +1771,20 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1789,8 +1795,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1835,17 +1841,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1902,7 +1907,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 06fa910..197aa073 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -50,39 +50,39 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_exp2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -109,17 +109,18 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_exp2_f32:
@@ -445,7 +446,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_exp2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -467,9 +468,9 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -730,7 +731,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_exp2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -757,10 +758,10 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index e8d037c..fca0398 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @floor_f16(
;
; VI-LABEL: floor_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -131,37 +131,37 @@ define amdgpu_kernel void @floor_v2f16(
;
; VI-LABEL: floor_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_floor_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
@@ -173,31 +173,31 @@ define amdgpu_kernel void @floor_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index a2e3060..038ad95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -48,177 +48,177 @@ define amdgpu_kernel void @fmuladd_f16(
;
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1
-; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_short v2, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
-; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -361,26 +361,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -389,24 +389,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -547,26 +547,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -575,24 +575,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -653,27 +653,27 @@ define amdgpu_kernel void @fmuladd_v2f16(
;
; VI-FLUSH-LABEL: fmuladd_v2f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0
; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
@@ -681,32 +681,32 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
-; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_v2f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
@@ -717,126 +717,126 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_v2f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_v2f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_v2f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_v2f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index aca7d3c..df4d3fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -107,46 +107,46 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
;
; GFX8-LABEL: kernel_fpmode_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19)
-; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 19)
+; GFX8-NEXT: s_and_b32 s0, 0x7f3ff, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: kernel_fpmode_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX9-NEXT: s_and_b32 s0, 0x87f3ff, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_fpmode_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_fpmode_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX11-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ea823f3..1f62bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -28,16 +28,16 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_bf16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80
+; GFX8CHECK-NEXT: s_movk_i32 s0, 0x7f80
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v0
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -58,27 +58,27 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
; GFX10CHECK-LABEL: sgpr_isnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10CHECK-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index da64c37..26c426a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -43,13 +43,13 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -67,25 +67,25 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 347e549..c7e7e7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -37,13 +37,13 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f32:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -61,26 +61,26 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
+; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -115,57 +115,46 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
;
-; GFX8SELDAG-LABEL: sgpr_isnan_f64:
-; GFX8SELDAG: ; %bb.0:
-; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX8SELDAG-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 3
-; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2
-; GFX8SELDAG-NEXT: s_endpgm
-;
-; GFX8GLISEL-LABEL: sgpr_isnan_f64:
-; GFX8GLISEL: ; %bb.0:
-; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GLISEL-NEXT: s_endpgm
+; GFX8CHECK-LABEL: sgpr_isnan_f64:
+; GFX8CHECK: ; %bb.0:
+; GFX8CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s4
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s5
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT: s_endpgm
;
; GFX9CHECK-LABEL: sgpr_isnan_f64:
; GFX9CHECK: ; %bb.0:
-; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9CHECK-NEXT: s_endpgm
;
; GFX10CHECK-LABEL: sgpr_isnan_f64:
; GFX10CHECK: ; %bb.0:
-; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f64:
; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
+; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -1469,3 +1458,6 @@ declare <7 x i1> @llvm.is.fpclass.v7f32(<7 x float>, i32)
declare <8 x i1> @llvm.is.fpclass.v8f32(<8 x float>, i32)
declare <16 x i1> @llvm.is.fpclass.v16f32(<16 x float>, i32)
declare <2 x i1> @llvm.is.fpclass.v2f64(<2 x double>, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX8GLISEL: {{.*}}
+; GFX8SELDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ad70589..6f1d374 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 82c73fa..e8671f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log10_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index b76e621..88b5e61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -52,39 +52,39 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -111,56 +111,57 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; GFX1100-SDAG-LABEL: s_log2_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
-; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log2_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -537,7 +538,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_log2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -559,9 +560,9 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -658,9 +659,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v3f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
@@ -668,23 +667,25 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
-; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -693,20 +694,20 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
@@ -717,7 +718,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -888,7 +889,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_log2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -915,10 +916,10 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v9, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1033,9 +1034,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v4f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
@@ -1048,23 +1047,24 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
-; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1073,36 +1073,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index d056a97..b8065d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -117,27 +117,27 @@ define amdgpu_kernel void @maxnum_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -175,79 +175,79 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
;
; VI-LABEL: maxnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -283,79 +283,79 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
;
; VI-LABEL: maxnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -396,26 +396,26 @@ define amdgpu_kernel void @maxnum_v2f16(
;
; VI-LABEL: maxnum_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
@@ -456,19 +456,19 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX11-LABEL: maxnum_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,61 +506,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
;
; VI-LABEL: maxnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -596,61 +596,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
;
; VI-LABEL: maxnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -697,30 +697,30 @@ define amdgpu_kernel void @maxnum_v3f16(
;
; VI-LABEL: maxnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_max_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
@@ -769,24 +769,24 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX11-LABEL: maxnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -843,35 +843,35 @@ define amdgpu_kernel void @maxnum_v4f16(
;
; VI-LABEL: maxnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
@@ -918,22 +918,22 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX11-LABEL: maxnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -980,79 +980,79 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmax_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmax_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index f934a2d..a78fc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -117,26 +117,26 @@ define amdgpu_kernel void @minnum_f16_ieee(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -202,78 +202,78 @@ define amdgpu_kernel void @minnum_f16_imm_a(
;
; VI-LABEL: minnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,78 +309,78 @@ define amdgpu_kernel void @minnum_f16_imm_b(
;
; VI-LABEL: minnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,26 +421,26 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
;
; VI-LABEL: minnum_v2f16_ieee:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
@@ -481,18 +481,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX11-LABEL: minnum_v2f16_ieee:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -565,60 +565,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -654,60 +654,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -754,30 +754,30 @@ define amdgpu_kernel void @minnum_v3f16(
;
; VI-LABEL: minnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_min_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
@@ -826,23 +826,23 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX11-LABEL: minnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v2, v1
; GFX11-NEXT: v_pk_min_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -899,35 +899,35 @@ define amdgpu_kernel void @minnum_v4f16(
;
; VI-LABEL: minnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
@@ -974,21 +974,21 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX11-LABEL: minnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1035,78 +1035,78 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmin_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmin_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665f..1423575 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: umulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s1, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s1
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s5
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s3, s3, s2
+; GFX9-NEXT: s_mul_i32 s2, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 0, s3
+; GFX9-NEXT: s_cselect_b32 s1, 0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX10-NEXT: s_mul_i32 s1, s1, s3
-; GFX10-NEXT: s_add_u32 s3, s8, s7
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_u32 s3, s3, s6
-; GFX10-NEXT: s_addc_u32 s3, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s3, s1
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX10-NEXT: s_mul_i32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s7, s8, s3
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_u32 s7, s7, s2
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_add_i32 s3, s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -423,28 +423,28 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: umulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX11-NEXT: s_mul_i32 s1, s1, s3
-; GFX11-NEXT: s_add_u32 s3, s8, s7
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_u32 s3, s3, s6
-; GFX11-NEXT: s_addc_u32 s3, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s3, s1
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_i32 s1, s8, s7
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX11-NEXT: s_mul_i32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s7, s8, s3
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_u32 s7, s7, s2
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s5
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_i32 s3, s8, s3
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_add_i32 s3, s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -454,26 +454,26 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: umulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_u32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_u32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, 0, s2
+; GFX12-NEXT: s_cselect_b32 s1, 0, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_nop 0
@@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: smulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s9
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s2
-; GFX9-NEXT: s_subb_u32 s10, s5, 0
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s1, s10, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s0
-; GFX9-NEXT: s_subb_u32 s5, s1, 0
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, s1
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_ashr_i32 s6, s1, 31
-; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_i32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s9, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s9
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_sub_u32 s9, s0, s6
+; GFX9-NEXT: s_subb_u32 s10, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s5, 0
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s10, s1
+; GFX9-NEXT: s_sub_u32 s5, s0, s4
+; GFX9-NEXT: s_subb_u32 s9, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s7, 0
+; GFX9-NEXT: s_cselect_b32 s1, s9, s1
+; GFX9-NEXT: s_cselect_b32 s0, s5, s0
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s5, s3, s2
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_mov_b32 s3, s2
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s0, 0, s5
+; GFX9-NEXT: s_cselect_b32 s1, 0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_add_u32 s11, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_add_u32 s11, s11, s6
-; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s5, 0
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s1, s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s10, s5
-; GFX10-NEXT: s_sub_u32 s9, s1, s0
-; GFX10-NEXT: s_subb_u32 s5, s4, 0
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s1
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_ashr_i32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_add_u32 s11, s8, s3
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX10-NEXT: s_add_u32 s11, s11, s2
+; GFX10-NEXT: s_mul_i32 s10, s5, s7
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s10
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_sub_u32 s9, s0, s6
+; GFX10-NEXT: s_subb_u32 s10, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s0, s9, s0
+; GFX10-NEXT: s_cselect_b32 s1, s10, s1
+; GFX10-NEXT: s_sub_u32 s5, s0, s4
+; GFX10-NEXT: s_subb_u32 s9, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s7, 0
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_cselect_b32 s1, s9, s1
+; GFX10-NEXT: s_cselect_b32 s0, s5, s0
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_add_i32 s5, s3, s2
+; GFX10-NEXT: s_ashr_i32 s2, s5, 31
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -622,42 +622,42 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: smulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_add_u32 s11, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT: s_add_u32 s11, s11, s6
-; GFX11-NEXT: s_mul_i32 s10, s1, s3
-; GFX11-NEXT: s_addc_u32 s4, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s4, s10
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_sub_u32 s9, s4, s2
-; GFX11-NEXT: s_subb_u32 s10, s5, 0
-; GFX11-NEXT: s_cmp_lt_i32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, s9, s4
-; GFX11-NEXT: s_cselect_b32 s4, s10, s5
-; GFX11-NEXT: s_sub_u32 s9, s1, s0
-; GFX11-NEXT: s_subb_u32 s5, s4, 0
-; GFX11-NEXT: s_cmp_lt_i32 s3, 0
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_cselect_b32 s5, s5, s4
-; GFX11-NEXT: s_cselect_b32 s4, s9, s1
-; GFX11-NEXT: s_add_i32 s1, s8, s7
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_add_u32 s11, s8, s3
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX11-NEXT: s_add_u32 s11, s11, s2
+; GFX11-NEXT: s_mul_i32 s10, s5, s7
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s10
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_sub_u32 s9, s0, s6
+; GFX11-NEXT: s_subb_u32 s10, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s5, 0
+; GFX11-NEXT: s_cselect_b32 s0, s9, s0
+; GFX11-NEXT: s_cselect_b32 s1, s10, s1
+; GFX11-NEXT: s_sub_u32 s5, s0, s4
+; GFX11-NEXT: s_subb_u32 s9, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s7, 0
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s1, s9, s1
+; GFX11-NEXT: s_cselect_b32 s0, s5, s0
+; GFX11-NEXT: s_add_i32 s3, s8, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_ashr_i32 s6, s1, 31
+; GFX11-NEXT: s_add_i32 s5, s3, s2
+; GFX11-NEXT: s_ashr_i32 s2, s5, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s7, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mov_b32 s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -667,39 +667,39 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: smulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_i32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_cmp_lt_i32 s1, 0
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9]
-; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_cmp_lt_i32 s5, 0
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[8:9]
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[6:7], s[4:5]
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_cselect_b32 s7, s9, s7
-; GFX12-NEXT: s_cselect_b32 s6, s8, s6
-; GFX12-NEXT: s_cmp_lt_i32 s3, 0
-; GFX12-NEXT: s_sub_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_cselect_b32 s3, s5, s7
-; GFX12-NEXT: s_cselect_b32 s2, s4, s6
-; GFX12-NEXT: s_ashr_i32 s4, s1, 31
+; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[2:3], s[0:1]
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_cselect_b32 s3, s9, s3
+; GFX12-NEXT: s_cselect_b32 s2, s8, s2
+; GFX12-NEXT: s_cmp_lt_i32 s7, 0
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_cselect_b32 s1, s1, s3
+; GFX12-NEXT: s_cselect_b32 s0, s0, s2
+; GFX12-NEXT: s_ashr_i32 s2, s5, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s5, s4
-; GFX12-NEXT: s_cmp_lg_u64 s[2:3], s[4:5]
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_mov_b32 s3, s2
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX12-NEXT: s_cselect_b32 s0, 0, s4
+; GFX12-NEXT: s_cselect_b32 s1, 0, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 9fcbdf3..27ea3e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -136,12 +136,12 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xy:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -179,14 +179,14 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
;
; VI-LABEL: local_size_xz:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x18
-; VI-NEXT: s_load_dword s3, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dword s5, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -269,14 +269,14 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xyz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dword s4, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: s_add_i32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: s_add_i32 s0, s0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 84afa3b0..18c910a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -30,38 +30,38 @@ define amdgpu_kernel void @rint_f16(
;
; GFX89-LABEL: rint_f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_rndne_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: rint_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -111,64 +111,64 @@ define amdgpu_kernel void @rint_v2f16(
;
; VI-LABEL: rint_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_rndne_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: rint_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: rint_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c5d2f79..d1e2008 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -24,60 +24,42 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s2
-; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: s_brev_b32 s0, -2
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s2
+; GFX89-NEXT: v_sub_f32_e32 v1, s2, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: s_brev_b32 s0, -2
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v1, s0, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,90 +235,52 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_v4f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_brev_b32 s10, -2
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s7
-; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s5
-; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s4
-; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_v4f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-NEXT: s_brev_b32 s2, -2
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s7
-; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s6
-; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s5
-; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4
-; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s4
-; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_v4f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s11, 0xf000
+; GFX89-NEXT: s_mov_b32 s10, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s7
+; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s7
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s6
+; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s5
+; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v4, s5
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s4
+; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v5, s4
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -346,27 +290,26 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v6|, 0.5
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,77 +424,78 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX89-LABEL: round_v8f32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX89-NEXT: s_brev_b32 s14, -2
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s15, 0xf000
+; GFX89-NEXT: s_mov_b32 s14, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_trunc_f32_e32 v0, s7
; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s7
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s6
; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s6
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s5
; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v4, s5
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s4
; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v5, s4
-; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
; GFX89-NEXT: v_trunc_f32_e32 v4, s11
; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s11
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v7, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s10
; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s10
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v6, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s9
; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v8, s9
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX89-NEXT: v_add_f32_e32 v5, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s8
; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v9, s8
-; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9
+; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX89-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -564,57 +508,56 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX11-NEXT: v_trunc_f32_e32 v9, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8
; GFX11-NEXT: v_trunc_f32_e32 v6, s10
; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9
; GFX11-NEXT: v_trunc_f32_e32 v10, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v11|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v12|, 0.5
; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v13|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11
-; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v14|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10
; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s0
; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v15|, 0.5
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s0
; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[12:15], 0 offset:16
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,62 +645,43 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s5, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f16_e32 v1, s4
-; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1
-; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX9-NEXT: s_movk_i32 s0, 0x7fff
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f16_e32 v1, s2
-; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1
-; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX89-NEXT: s_movk_i32 s0, 0x7fff
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f16_e32 v1, s2
+; GFX89-NEXT: v_sub_f16_e32 v2, s2, v1
+; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v0, s0, v0, v2
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,30 +752,30 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
;
; GFX8-LABEL: round_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s6, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_movk_i32 s1, 0x7fff
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s5, s4, 16
-; GFX8-NEXT: v_trunc_f16_e32 v1, s5
-; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1
+; GFX8-NEXT: s_lshr_b32 s0, s2, 16
+; GFX8-NEXT: v_trunc_f16_e32 v1, s0
+; GFX8-NEXT: v_sub_f16_e32 v2, s0, v1
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_bfi_b32 v2, s1, v2, v3
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_trunc_f16_e32 v2, s4
-; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2
+; GFX8-NEXT: v_trunc_f16_e32 v2, s2
+; GFX8-NEXT: v_sub_f16_e32 v3, s2, v2
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v3
; GFX8-NEXT: v_add_f16_e32 v0, v2, v0
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v2f16:
@@ -886,7 +810,9 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: v_trunc_f16_e32 v1, s2
@@ -895,22 +821,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1
; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s0
; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2ce0a62..4082ad7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_sin_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v2, v3
; GFX9-NEXT: v_sin_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_sin_f16_e32 v2, v3
; GFX10-NEXT: v_sin_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_sin_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index f2d57ba9..dc19189 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @sqrt_f16(
;
; VI-LABEL: sqrt_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -109,37 +109,37 @@ define amdgpu_kernel void @sqrt_v2f16(
;
; VI-LABEL: sqrt_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_sqrt_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
@@ -147,7 +147,7 @@ define amdgpu_kernel void @sqrt_v2f16(
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index d1e2ddc..3fb1699 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @trunc_f16(
;
; VI-LABEL: trunc_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,44 +110,44 @@ define amdgpu_kernel void @trunc_v2f16(
;
; VI-LABEL: trunc_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index cfaefca..9de4eae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -34,26 +34,26 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_f64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; GFX12-LABEL: constant_load_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 502cd14..876c246 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -25,13 +25,13 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: constant_load_i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -65,14 +65,14 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,13 +101,13 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v2i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -140,12 +140,12 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -174,13 +174,13 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v3i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -212,12 +212,12 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -246,13 +246,13 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v4i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -285,12 +285,12 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -319,13 +319,13 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v8i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -358,12 +358,12 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -392,13 +392,13 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v16i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -431,12 +431,12 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -460,13 +460,13 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v32i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -488,12 +488,12 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v32i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -518,14 +518,14 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v64i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -547,13 +547,13 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v64i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -582,13 +582,13 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -611,12 +611,12 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -647,13 +647,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -678,14 +678,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -715,13 +715,13 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -744,12 +744,12 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -780,13 +780,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -811,14 +811,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -850,13 +850,13 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
@@ -884,17 +884,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -926,13 +926,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
@@ -961,16 +961,16 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1004,13 +1004,13 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v0
@@ -1046,10 +1046,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1060,7 +1060,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1094,13 +1094,13 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1137,10 +1137,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0
@@ -1148,7 +1148,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1182,13 +1182,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1226,10 +1226,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1278,13 +1278,13 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1323,10 +1323,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1337,7 +1337,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1376,17 +1376,17 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 5, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
@@ -1443,10 +1443,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0
; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
@@ -1467,8 +1467,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1507,17 +1507,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v0
@@ -1578,10 +1578,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -1599,8 +1599,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1649,25 +1649,25 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
@@ -1767,10 +1767,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0
; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0
@@ -1811,10 +1811,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1863,24 +1863,24 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v12, 12, v0
@@ -1990,10 +1990,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2132,112 +2132,112 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
; GFX8-NEXT: v_and_b32_e32 v24, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
; GFX8-NEXT: v_and_b32_e32 v22, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
; GFX8-NEXT: v_and_b32_e32 v23, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s2
; GFX8-NEXT: v_and_b32_e32 v26, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s2
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10018
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s2
-; GFX8-NEXT: s_and_b32 s6, s4, 1
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10013
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x10012
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x10011
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x10010
-; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10017
-; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10016
-; GFX8-NEXT: s_bfe_u32 s11, s4, 0x10015
-; GFX8-NEXT: s_bfe_u32 s12, s4, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v11, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s0
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10018
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s0
+; GFX8-NEXT: s_and_b32 s6, s2, 1
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10013
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10012
+; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10011
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10010
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x10017
+; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10016
+; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10015
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v11, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s4
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s2
; GFX8-NEXT: v_mov_b32_e32 v8, s10
; GFX8-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NEXT: v_mov_b32_e32 v10, s8
; GFX8-NEXT: v_mov_b32_e32 v11, s7
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v22
; GFX8-NEXT: v_and_b32_e32 v10, 1, v25
; GFX8-NEXT: v_and_b32_e32 v22, 1, v21
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v24
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s4
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s2
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v23
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s2
; GFX8-NEXT: v_mov_b32_e32 v8, 1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v12, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX8-NEXT: v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v26
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -2245,7 +2245,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2349,56 +2349,56 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v1, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v10, 1, s0
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v25, 1, v14
; GFX12-NEXT: v_and_b32_e32 v26, 1, v18
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
-; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10017
; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
-; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
; GFX12-NEXT: v_and_b32_e32 v10, 1, v3
; GFX12-NEXT: v_and_b32_e32 v14, 1, v19
@@ -2412,23 +2412,23 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
-; GFX12-NEXT: v_mov_b32_e32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v17, 0xffff, v25
+; GFX12-NEXT: v_mov_b32_e32 v25, s0
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX12-NEXT: v_and_b32_e32 v22, 1, v12
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_and_b32 v15, 0xffff, v21
; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33
; GFX12-NEXT: v_and_b32_e32 v8, 1, v8
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2530,111 +2530,111 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s3
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s3, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s1
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s8, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s1, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s11, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v5, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v1, v25, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_bfe_i32 v4, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v8, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NEXT: v_bfe_i32 v3, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v2, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2770,48 +2770,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, s0
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10014
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s0
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
-; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3
+; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s1
; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1
@@ -2828,7 +2828,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1
@@ -2837,16 +2837,16 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: v_mov_b32_e32 v16, s4
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v16, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3025,99 +3025,99 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s7, s3, 1
-; GFX8-NEXT: s_and_b32 s9, s2, 1
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014
+; GFX8-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s7, s1, 1
+; GFX8-NEXT: s_and_b32 s9, s0, 1
+; GFX8-NEXT: s_bfe_u32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s22, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s23, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s24, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s25, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s25
; GFX8-NEXT: v_mov_b32_e32 v23, s24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s23
; GFX8-NEXT: v_mov_b32_e32 v23, s22
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s20
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v22, s19
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
; GFX8-NEXT: v_and_b32_e32 v21, 1, v0
; GFX8-NEXT: v_and_b32_e32 v27, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s1
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v28, 1, v22
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s0
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX8-NEXT: v_and_b32_e32 v20, 1, v19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_and_b32_e32 v18, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s1
; GFX8-NEXT: v_mov_b32_e32 v25, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s11
; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v23, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
@@ -3129,129 +3129,129 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v20, 1, v14
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: v_and_b32_e32 v17, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
; GFX8-NEXT: v_and_b32_e32 v19, 1, v15
; GFX8-NEXT: v_mov_b32_e32 v16, s11
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
; GFX8-NEXT: v_mov_b32_e32 v15, s10
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v15, 1, v11
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10
; GFX8-NEXT: v_and_b32_e32 v10, 1, v9
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s10, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xb0
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v8
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s10
; GFX8-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v8
; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
; GFX8-NEXT: v_mov_b32_e32 v10, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s1
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20
; GFX8-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23
; GFX8-NEXT: v_and_b32_e32 v3, 1, v22
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27
; GFX8-NEXT: v_and_b32_e32 v22, 1, v26
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6
; GFX8-NEXT: v_and_b32_e32 v16, 1, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX8-NEXT: v_mov_b32_e32 v12, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v12, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
@@ -3444,113 +3444,113 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v2, 13, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v3, 9, s0
; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 5, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s14, s1, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 3, s1
; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: s_bfe_u32 s19, s1, 0x10014
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
-; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v3, 5, s3
+; GFX12-NEXT: s_bfe_u32 s13, s1, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v29, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s3
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s15, s1, 0x10011
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
-; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s3
+; GFX12-NEXT: s_bfe_u32 s16, s1, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
-; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s3
+; GFX12-NEXT: s_bfe_u32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_u32 s18, s1, 0x10016
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5
-; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s3
+; GFX12-NEXT: s_bfe_u32 s13, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v18
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10015
; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_and_b32_e32 v32, 1, v11
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10012
; GFX12-NEXT: v_and_b32_e32 v20, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s2
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v24, 1, v15
-; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4
-; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: v_mov_b32_e32 v50, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v15, 6, s2
+; GFX12-NEXT: s_bfe_u32 s12, s0, 0x10017
+; GFX12-NEXT: v_mov_b32_e32 v50, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v28, 1, v13
-; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: v_lshrrev_b16 v13, 4, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
; GFX12-NEXT: v_and_b32_e32 v3, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v17, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 4, s0
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
; GFX12-NEXT: v_and_b32_e32 v29, 1, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:192
; GFX12-NEXT: v_mov_b32_e32 v52, s12
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18
-; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13
-; GFX12-NEXT: v_mov_b32_e32 v51, s3
+; GFX12-NEXT: v_dual_mov_b32 v49, s0 :: v_dual_mov_b32 v50, s13
+; GFX12-NEXT: v_mov_b32_e32 v51, s1
; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24
; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23
; GFX12-NEXT: v_and_b32_e32 v23, 1, v22
@@ -3583,28 +3583,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_and_b32 v42, 0xffff, v42
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX12-NEXT: v_and_b32_e32 v5, 1, v5
; GFX12-NEXT: v_and_b32_e32 v37, 1, v37
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_clause 0xd
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3783,84 +3783,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2
-; GFX8-NEXT: s_lshr_b32 s7, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: s_bfe_i32 s19, s3, 0x10013
-; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012
-; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011
-; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010
-; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015
-; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s0
+; GFX8-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX8-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: s_bfe_i32 s19, s1, 0x10013
+; GFX8-NEXT: s_bfe_i32 s20, s1, 0x10012
+; GFX8-NEXT: s_bfe_i32 s21, s1, 0x10011
+; GFX8-NEXT: s_bfe_i32 s22, s1, 0x10010
+; GFX8-NEXT: s_bfe_i32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_i32 s23, s1, 0x10015
+; GFX8-NEXT: s_bfe_i32 s24, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s23
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s22
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s20
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 64
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v23, s11
@@ -3868,58 +3868,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v22, s10
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v15, s1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: v_mov_b32_e32 v12, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: v_mov_b32_e32 v12, s5
; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1
; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v7, s9
-; GFX8-NEXT: v_mov_b32_e32 v11, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
@@ -3929,21 +3929,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1
@@ -3951,48 +3951,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v24, v28, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v27, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v11, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v18, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v13, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_endpgm
@@ -4244,82 +4244,82 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s5
-; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX12-NEXT: s_bfe_i32 s13, s3, 0x10013
-; GFX12-NEXT: s_bfe_i32 s14, s3, 0x10012
-; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10011
-; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10010
-; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10016
-; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10014
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v28, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v29, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v27, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v44, 7, s3
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX12-NEXT: s_bfe_i32 s13, s1, 0x10013
+; GFX12-NEXT: s_bfe_i32 s14, s1, 0x10012
+; GFX12-NEXT: s_bfe_i32 s15, s1, 0x10011
+; GFX12-NEXT: s_bfe_i32 s16, s1, 0x10010
+; GFX12-NEXT: s_bfe_i32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_i32 s18, s1, 0x10016
+; GFX12-NEXT: s_bfe_i32 s19, s1, 0x10014
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s3
+; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s1
; GFX12-NEXT: v_dual_mov_b32 v48, s19 :: v_dual_mov_b32 v51, s17
; GFX12-NEXT: v_dual_mov_b32 v50, s18 :: v_dual_mov_b32 v53, s15
-; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 14, s0
; GFX12-NEXT: v_dual_mov_b32 v52, s16 :: v_dual_mov_b32 v55, s13
-; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10015
+; GFX12-NEXT: s_bfe_i32 s13, s0, 0x10015
; GFX12-NEXT: v_mov_b32_e32 v54, s14
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
-; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4
-; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v40, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v42, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v43, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v36, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v39, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 3, s2
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10012
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s12, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10014
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1
@@ -4329,10 +4329,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:192
-; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s2
-; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s3
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s0
+; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s1
; GFX12-NEXT: v_mov_b32_e32 v53, s10
; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
@@ -4362,7 +4362,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s3
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1
@@ -4376,22 +4376,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1
; GFX12-NEXT: v_mov_b32_e32 v32, s7
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v56, v[32:35], s[0:1]
-; GFX12-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v56, v[44:47], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v56, v[40:43], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v56, v[36:39], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v56, v[32:35], s[4:5]
+; GFX12-NEXT: global_store_b128 v56, v[28:31], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v56, v[24:27], s[4:5] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v56, v[20:23], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v56, v[16:19], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v56, v[12:15], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v56, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v56, v[4:7], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v56, v[0:3], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4423,14 +4423,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4455,14 +4455,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4494,13 +4494,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4527,14 +4527,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4566,14 +4566,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4598,14 +4598,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4637,13 +4637,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4670,14 +4670,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4711,14 +4711,14 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4749,17 +4749,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4794,13 +4794,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4833,10 +4833,10 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4845,7 +4845,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4882,17 +4882,17 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4900,10 +4900,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v8
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -4935,10 +4935,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v5, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v5, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
@@ -4950,8 +4950,8 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v5, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v5, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4990,17 +4990,17 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -5045,10 +5045,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v6, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v6, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v6, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -5062,8 +5062,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5102,21 +5102,21 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v0
@@ -5162,10 +5162,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v6, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5182,8 +5182,8 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5225,17 +5225,17 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 3, v0
@@ -5286,10 +5286,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5307,8 +5307,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5357,24 +5357,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
@@ -5382,7 +5382,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v13, v1
; GFX8-NEXT: v_mov_b32_e32 v15, v1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -5457,10 +5457,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v12, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -5482,10 +5482,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5541,24 +5541,24 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 6, v0
@@ -5646,10 +5646,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v16, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1
@@ -5675,10 +5675,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5750,21 +5750,21 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v6, v2
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s5
-; GFX8-NEXT: v_mov_b32_e32 v22, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v22, s2
; GFX8-NEXT: v_mov_b32_e32 v9, v2
; GFX8-NEXT: v_mov_b32_e32 v11, v2
; GFX8-NEXT: v_mov_b32_e32 v12, v2
@@ -5781,49 +5781,49 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
; GFX8-NEXT: v_mov_b32_e32 v23, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[11:14]
-; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v6
@@ -5832,13 +5832,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mov_b32_e32 v12, s0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0
@@ -5934,10 +5934,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0
@@ -5981,14 +5981,14 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6073,40 +6073,40 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s3
-; GFX8-NEXT: v_mov_b32_e32 v15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v21, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v20, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, s1
+; GFX8-NEXT: v_mov_b32_e32 v15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s5
; GFX8-NEXT: v_mov_b32_e32 v27, s1
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
+; GFX8-NEXT: v_mov_b32_e32 v20, s4
; GFX8-NEXT: v_mov_b32_e32 v26, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
@@ -6267,10 +6267,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v32, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v32, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1
@@ -6320,14 +6320,14 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6447,86 +6447,86 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s0
; GFX8-NEXT: v_and_b32_e32 v11, 1, v2
; GFX8-NEXT: v_and_b32_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT: s_lshr_b32 s14, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
+; GFX8-NEXT: s_lshr_b32 s14, s0, 24
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
-; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018
-; GFX8-NEXT: s_and_b32 s11, s2, 1
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s0
+; GFX8-NEXT: s_bfe_u32 s10, s0, 0x10018
+; GFX8-NEXT: s_and_b32 s11, s0, 1
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s21, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s22, s0, 0x10017
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v16, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_and_b32_e32 v25, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xf0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x60
+; GFX8-NEXT: s_add_u32 s12, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v18, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s14
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_mov_b32_e32 v14, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 64
+; GFX8-NEXT: s_add_u32 s12, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[18:21]
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v9
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v11
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
@@ -6539,17 +6539,17 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_and_b32_e32 v21, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v8, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s17
@@ -6557,62 +6557,62 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v5
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v9, 1, v12
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v12, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v2, v14
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v16, v1
; GFX8-NEXT: v_mov_b32_e32 v18, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s14
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v4
; GFX8-NEXT: v_and_b32_e32 v4, 1, v26
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v25
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -6783,71 +6783,71 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v0, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 12, s0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v4, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v11, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
; GFX12-NEXT: v_and_b32_e32 v24, 1, v4
; GFX12-NEXT: v_and_b32_e32 v25, 1, v8
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s0
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, s0
; GFX12-NEXT: v_and_b32_e32 v26, 1, v15
; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
; GFX12-NEXT: v_and_b32_e32 v4, 1, v14
; GFX12-NEXT: v_and_b32_e32 v8, 1, v12
@@ -6856,9 +6856,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v39, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v5, v1
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
@@ -6870,26 +6870,26 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25
; GFX12-NEXT: v_mov_b32_e32 v24, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[39:42], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[19:22], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[15:18], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
; GFX12-NEXT: v_mov_b32_e32 v28, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7067,43 +7067,43 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s4, 22
-; GFX8-NEXT: s_lshr_b32 s8, s4, 23
-; GFX8-NEXT: s_lshr_b32 s10, s4, 20
-; GFX8-NEXT: s_lshr_b32 s12, s4, 21
-; GFX8-NEXT: s_lshr_b32 s14, s4, 18
-; GFX8-NEXT: s_lshr_b32 s16, s4, 19
-; GFX8-NEXT: s_lshr_b32 s18, s4, 16
-; GFX8-NEXT: s_lshr_b32 s20, s4, 17
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s2
+; GFX8-NEXT: s_lshr_b32 s6, s2, 22
+; GFX8-NEXT: s_lshr_b32 s8, s2, 23
+; GFX8-NEXT: s_lshr_b32 s10, s2, 20
+; GFX8-NEXT: s_lshr_b32 s12, s2, 21
+; GFX8-NEXT: s_lshr_b32 s14, s2, 18
+; GFX8-NEXT: s_lshr_b32 s16, s2, 19
+; GFX8-NEXT: s_lshr_b32 s18, s2, 16
+; GFX8-NEXT: s_lshr_b32 s20, s2, 17
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
@@ -7113,33 +7113,33 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v21, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v23, s8
; GFX8-NEXT: v_mov_b32_e32 v24, s9
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v21, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s11
; GFX8-NEXT: v_mov_b32_e32 v23, s12
; GFX8-NEXT: v_mov_b32_e32 v24, s13
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x80
+; GFX8-NEXT: s_add_u32 s6, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v21, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s16
; GFX8-NEXT: v_mov_b32_e32 v24, s17
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v21, s18
@@ -7147,15 +7147,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v23, s20
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x70
+; GFX8-NEXT: s_add_u32 s6, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v23, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v2, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_mov_b32_e32 v26, s7
@@ -7163,29 +7163,29 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v25, s6
; GFX8-NEXT: v_bfe_i32 v23, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v15, 0, 1
-; GFX8-NEXT: s_add_u32 s6, s0, 0x50
+; GFX8-NEXT: s_add_u32 s6, s4, 0x50
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_bfe_i32 v25, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v13, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v14, s7
; GFX8-NEXT: v_mov_b32_e32 v13, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NEXT: s_add_u32 s6, s4, 64
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[23:26]
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s7
; GFX8-NEXT: v_mov_b32_e32 v9, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NEXT: s_add_u32 s6, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[23:26]
; GFX8-NEXT: v_bfe_i32 v10, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v8, 0, 1
@@ -7194,18 +7194,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 32
+; GFX8-NEXT: s_add_u32 s6, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 16
+; GFX8-NEXT: s_add_u32 s6, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v0, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -7214,44 +7214,44 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: v_bfe_i32 v17, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -7448,42 +7448,42 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2
-; GFX12-NEXT: s_lshr_b32 s22, s2, 24
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
-; GFX12-NEXT: v_lshrrev_b16 v6, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 1, s2
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s18, s2, 21
-; GFX12-NEXT: v_lshrrev_b16 v1, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v26, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v28, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 3, s0
+; GFX12-NEXT: s_lshr_b32 s22, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
+; GFX12-NEXT: v_lshrrev_b16 v6, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, s0
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s18, s0, 21
+; GFX12-NEXT: v_lshrrev_b16 v1, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s0
; GFX12-NEXT: v_lshrrev_b16 v12, 6, s22
; GFX12-NEXT: v_lshrrev_b16 v14, 7, s22
-; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s0
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s22
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s22
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s4, s2, 18
-; GFX12-NEXT: v_lshrrev_b16 v37, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
+; GFX12-NEXT: s_lshr_b32 s2, s0, 18
+; GFX12-NEXT: v_lshrrev_b16 v37, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
; GFX12-NEXT: v_lshrrev_b16 v13, 2, s22
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s22
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v49, s12
-; GFX12-NEXT: v_lshrrev_b16 v30, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v30, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 9, s0
; GFX12-NEXT: v_lshrrev_b16 v11, 1, s22
; GFX12-NEXT: v_bfe_i32 v7, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1
@@ -7491,15 +7491,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s14
; GFX12-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 19
+; GFX12-NEXT: s_lshr_b32 s6, s0, 19
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v27, v8, 0, 1
; GFX12-NEXT: v_bfe_i32 v25, v6, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v54, s17 :: v_dual_mov_b32 v55, s18
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v56, s19
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
-; GFX12-NEXT: s_lshr_b32 s20, s2, 17
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
+; GFX12-NEXT: s_lshr_b32 s20, s0, 17
; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 1
@@ -7509,7 +7509,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v43, v10, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v9, 0, 1
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[0:1], 0x10000
; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v34, 0, 1
@@ -7524,18 +7524,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v30, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:160
-; GFX12-NEXT: v_dual_mov_b32 v49, s4 :: v_dual_mov_b32 v50, s5
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:160
+; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s3
; GFX12-NEXT: v_dual_mov_b32 v51, s6 :: v_dual_mov_b32 v52, s7
; GFX12-NEXT: v_mov_b32_e32 v53, s10
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[22:23], 0x10000
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX12-NEXT: v_dual_mov_b32 v54, s11 :: v_dual_mov_b32 v55, s20
; GFX12-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v1, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s0
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47
@@ -7552,22 +7552,22 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v36, 31, v35
; GFX12-NEXT: v_ashrrev_i32_e32 v34, 31, v33
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v10, s3
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v10, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7783,159 +7783,159 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s0
; GFX8-NEXT: v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT: s_lshr_b32 s33, s3, 24
-; GFX8-NEXT: s_lshr_b32 s24, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s0
+; GFX8-NEXT: s_lshr_b32 s33, s1, 24
+; GFX8-NEXT: s_lshr_b32 s24, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s22, s3, 1
-; GFX8-NEXT: s_and_b32 s23, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2
-; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x190
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s10, s0, 0x180
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: s_add_u32 s14, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
-; GFX8-NEXT: s_add_u32 s18, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s22, s1, 1
+; GFX8-NEXT: s_and_b32 s23, s0, 1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s0
+; GFX8-NEXT: s_bfe_u32 s25, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s26, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s27, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s28, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s29, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s30, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s31, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s34, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s35, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s36, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s37, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s38, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s39, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s40, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s41, s1, 0x10014
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1b0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x190
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s10, s4, 0x180
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
+; GFX8-NEXT: s_add_u32 s14, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s15, s5, 0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
+; GFX8-NEXT: s_add_u32 s18, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s19, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v7, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s42
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v24, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x170
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x170
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s1
; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s1
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v25, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x1f0
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s33
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v21, 1, v21
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 7, s33
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s42, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 7, s24
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x60
+; GFX8-NEXT: s_add_u32 s42, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v19
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX8-NEXT: v_mov_b32_e32 v18, s42
; GFX8-NEXT: v_mov_b32_e32 v19, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x50
+; GFX8-NEXT: s_add_u32 s42, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v17
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v16
; GFX8-NEXT: v_mov_b32_e32 v16, s42
; GFX8-NEXT: v_mov_b32_e32 v17, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 64
+; GFX8-NEXT: s_add_u32 s42, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v17, 1
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s42
; GFX8-NEXT: v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v27, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 48
+; GFX8-NEXT: s_add_u32 s42, s4, 48
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v14
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v13
; GFX8-NEXT: v_mov_b32_e32 v13, s42
; GFX8-NEXT: v_mov_b32_e32 v14, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 32
+; GFX8-NEXT: s_add_u32 s42, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v11
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v10
; GFX8-NEXT: v_mov_b32_e32 v10, s42
; GFX8-NEXT: v_mov_b32_e32 v11, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 16
+; GFX8-NEXT: s_add_u32 s42, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v9
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s42
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x160
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x160
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s33
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25]
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s33
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_and_b32_e32 v28, 1, v10
; GFX8-NEXT: v_and_b32_e32 v19, 1, v8
@@ -7945,12 +7945,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x150
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x150
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v22, 1, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NEXT: v_and_b32_e32 v7, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v8, v1
@@ -7958,28 +7958,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s1
; GFX8-NEXT: v_and_b32_e32 v10, 1, v4
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x140
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x140
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
; GFX8-NEXT: v_and_b32_e32 v20, 1, v2
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v8, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s1
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v4
; GFX8-NEXT: v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x130
+; GFX8-NEXT: s_add_u32 s42, s4, 0x130
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s1
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v18
; GFX8-NEXT: v_mov_b32_e32 v17, s42
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
@@ -7988,25 +7988,25 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v18, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x120
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x120
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s1
; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10]
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
; GFX8-NEXT: v_mov_b32_e32 v17, v1
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x110
+; GFX8-NEXT: s_add_u32 s42, s4, 0x110
; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v15
@@ -8015,13 +8015,13 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v20, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v13
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s41
; GFX8-NEXT: v_mov_b32_e32 v2, s40
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v13, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s38
@@ -8040,7 +8040,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v13, s13
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s14
@@ -8058,66 +8058,66 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NEXT: v_mov_b32_e32 v13, s19
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: s_add_u32 s2, s0, 0x100
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0x100
; GFX8-NEXT: v_mov_b32_e32 v0, s23
; GFX8-NEXT: v_mov_b32_e32 v2, v10
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1e0
; GFX8-NEXT: v_mov_b32_e32 v0, s22
; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_and_b32_e32 v26, 1, v14
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s33
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1d0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v27
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v26
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1c0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s33
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28
; GFX8-NEXT: v_mov_b32_e32 v15, v1
; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, v5
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s24
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v23
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -8434,58 +8434,58 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 13, s1
; GFX12-NEXT: v_and_b32_e32 v34, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 9, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 3, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v3, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s1
; GFX12-NEXT: v_and_b32_e32 v50, 1, v14
; GFX12-NEXT: v_and_b32_e32 v47, 1, v18
; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 1, s0
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_and_b32_e32 v42, 1, v8
; GFX12-NEXT: v_and_b32_e32 v52, 1, v10
; GFX12-NEXT: v_and_b32_e32 v40, 1, v23
; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24
-; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v8, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v24, 4, s3
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10014
; GFX12-NEXT: v_and_b32_e32 v33, 1, v25
; GFX12-NEXT: v_and_b32_e32 v25, 1, v6
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 11, s1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5
-; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5
+; GFX12-NEXT: v_lshrrev_b16 v36, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v37, 6, s3
; GFX12-NEXT: v_and_b32_e32 v56, 1, v8
; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23
@@ -8494,16 +8494,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v23, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3
-; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
+; GFX12-NEXT: s_bfe_u32 s9, s1, 0x10013
; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s0
; GFX12-NEXT: v_and_b32_e32 v22, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4
+; GFX12-NEXT: v_lshrrev_b16 v54, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v55, 1, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:96
; GFX12-NEXT: v_and_b32_e32 v23, 1, v37
; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36
; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34
@@ -8512,91 +8512,91 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9
; GFX12-NEXT: v_mov_b32_e32 v27, v1
; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:64
; GFX12-NEXT: v_and_b32_e32 v34, 1, v13
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41
; GFX12-NEXT: v_and_b32_e32 v2, 1, v54
-; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[26:29], s[4:5] offset:80
; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:48
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 14, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:416
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v19, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v32, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:432
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
-; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v39, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 10, s1
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10011
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:400
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s0
; GFX12-NEXT: v_and_b32_e32 v31, 1, v31
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:384
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43
; GFX12-NEXT: v_and_b32_e32 v41, 1, v15
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v17, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v46, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v49, 6, s2
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32
; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 2, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v51, 1, v17
; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v52, v1
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:32
; GFX12-NEXT: v_and_b32_e32 v41, 1, v49
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46
; GFX12-NEXT: v_mov_b32_e32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v46, v1
; GFX12-NEXT: v_mov_b32_e32 v2, v37
; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16
; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[51:54], s[4:5] offset:16
; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21
; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20
; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50
@@ -8608,40 +8608,40 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v56, 1, v28
; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v1, v[52:55], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v1, v[48:51], s[4:5] offset:352
; GFX12-NEXT: v_mov_b32_e32 v41, v1
; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, v36
; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33
; GFX12-NEXT: v_mov_b32_e32 v32, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: global_store_b128 v1, v[56:59], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v1, v[45:48], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v1, v[38:41], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:256
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v30
; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:480
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1
; GFX12-NEXT: v_mov_b32_e32 v30, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -8977,13 +8977,13 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v29, s1
-; GFX8-NEXT: v_mov_b32_e32 v28, s0
+; GFX8-NEXT: v_mov_b32_e32 v29, s5
+; GFX8-NEXT: v_mov_b32_e32 v28, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s16, s11, 22
; GFX8-NEXT: s_lshr_b32 s18, s11, 23
@@ -9004,8 +9004,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_mov_b32 s6, s11
; GFX8-NEXT: s_lshr_b32 s12, s11, 24
; GFX8-NEXT: s_lshr_b32 s8, s10, 24
-; GFX8-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000
; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
@@ -9025,91 +9025,91 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v22, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1b0
; GFX8-NEXT: v_mov_b32_e32 v23, s17
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1a0
; GFX8-NEXT: v_mov_b32_e32 v24, s18
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x190
+; GFX8-NEXT: s_add_u32 s16, s4, 0x190
; GFX8-NEXT: v_mov_b32_e32 v22, s20
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s22
; GFX8-NEXT: v_mov_b32_e32 v25, s23
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x180
+; GFX8-NEXT: s_add_u32 s16, s4, 0x180
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s25
; GFX8-NEXT: v_mov_b32_e32 v24, s26
; GFX8-NEXT: v_mov_b32_e32 v25, s27
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s28
; GFX8-NEXT: v_mov_b32_e32 v23, s29
; GFX8-NEXT: v_mov_b32_e32 v24, s30
; GFX8-NEXT: v_mov_b32_e32 v25, s31
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v22, s34
; GFX8-NEXT: v_mov_b32_e32 v23, s35
; GFX8-NEXT: v_mov_b32_e32 v24, s36
; GFX8-NEXT: v_mov_b32_e32 v25, s37
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v22, s38
; GFX8-NEXT: v_mov_b32_e32 v23, s39
; GFX8-NEXT: v_mov_b32_e32 v24, s40
; GFX8-NEXT: v_mov_b32_e32 v25, s41
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x80
+; GFX8-NEXT: s_add_u32 s16, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_mov_b32_e32 v24, s44
; GFX8-NEXT: v_mov_b32_e32 v25, s45
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x70
+; GFX8-NEXT: s_add_u32 s16, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s46
; GFX8-NEXT: v_mov_b32_e32 v23, s47
; GFX8-NEXT: v_mov_b32_e32 v24, s48
; GFX8-NEXT: v_mov_b32_e32 v25, s49
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v26, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s17
; GFX8-NEXT: v_mov_b32_e32 v20, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x60
+; GFX8-NEXT: s_add_u32 s16, s4, 0x60
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s10
; GFX8-NEXT: v_bfe_i32 v26, v19, 0, 1
@@ -9119,9 +9119,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v18, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x50
+; GFX8-NEXT: s_add_u32 s16, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27]
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v16, s16
@@ -9137,7 +9137,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v17, s17
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s11
@@ -9154,15 +9154,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 3, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v14, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v15, s11
; GFX8-NEXT: v_mov_b32_e32 v14, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27]
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v26, v13, 0, 1
@@ -9171,18 +9171,18 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v12, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
+; GFX8-NEXT: s_add_u32 s10, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v10, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v8, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v8, s10
@@ -9190,32 +9190,32 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[24:27]
-; GFX8-NEXT: s_add_u32 s10, s0, 0x170
+; GFX8-NEXT: s_add_u32 s10, s4, 0x170
; GFX8-NEXT: v_bfe_i32 v26, v7, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_mov_b32_e32 v24, s14
; GFX8-NEXT: v_mov_b32_e32 v25, s15
; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v5, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 0x160
+; GFX8-NEXT: s_add_u32 s10, s4, 0x160
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[24:27]
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x150
+; GFX8-NEXT: s_add_u32 s10, s4, 0x150
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[3:6]
; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
@@ -9228,39 +9228,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v25, 3, s8
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s8
-; GFX8-NEXT: s_add_u32 s8, s0, 0x140
+; GFX8-NEXT: s_add_u32 s8, s4, 0x140
; GFX8-NEXT: v_bfe_i32 v2, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x130
+; GFX8-NEXT: s_add_u32 s8, s4, 0x130
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v4, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v21, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x120
+; GFX8-NEXT: s_add_u32 s8, s4, 0x120
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: v_bfe_i32 v21, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v20, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x110
+; GFX8-NEXT: s_add_u32 s8, s4, 0x110
; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
; GFX8-NEXT: v_bfe_i32 v25, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v18, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_bfe_i32 v4, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v26, 0, 1
@@ -9270,31 +9270,31 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s12
; GFX8-NEXT: v_mov_b32_e32 v23, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x100
+; GFX8-NEXT: s_add_u32 s6, s4, 0x100
; GFX8-NEXT: v_bfe_i32 v25, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v24, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1f0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1f0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_bfe_i32 v16, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1e0
; GFX8-NEXT: v_bfe_i32 v21, v27, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
; GFX8-NEXT: v_bfe_i32 v29, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v27, v12, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s12
@@ -9302,12 +9302,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX8-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1d0
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[27:30]
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v10, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9
@@ -9315,41 +9315,41 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s12
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX8-NEXT: v_bfe_i32 v14, v8, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -9724,115 +9724,115 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s19, s5
+; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s26, s3, 22
-; GFX12-NEXT: s_lshr_b32 s28, s3, 23
-; GFX12-NEXT: s_lshr_b32 s30, s3, 20
-; GFX12-NEXT: s_lshr_b32 s34, s3, 21
+; GFX12-NEXT: s_lshr_b32 s26, s1, 22
+; GFX12-NEXT: s_lshr_b32 s28, s1, 23
+; GFX12-NEXT: s_lshr_b32 s30, s1, 20
+; GFX12-NEXT: s_lshr_b32 s34, s1, 21
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX12-NEXT: s_lshr_b32 s20, s3, 18
+; GFX12-NEXT: s_lshr_b32 s20, s1, 18
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v43, s27
; GFX12-NEXT: v_dual_mov_b32 v42, s26 :: v_dual_mov_b32 v45, s29
; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31
-; GFX12-NEXT: s_lshr_b32 s22, s3, 19
+; GFX12-NEXT: s_lshr_b32 s22, s1, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s34
-; GFX12-NEXT: s_lshr_b32 s24, s3, 16
-; GFX12-NEXT: s_lshr_b32 s36, s3, 17
+; GFX12-NEXT: s_lshr_b32 s24, s1, 16
+; GFX12-NEXT: s_lshr_b32 s36, s1, 17
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:432
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:416
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:432
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:416
; GFX12-NEXT: v_dual_mov_b32 v43, s21 :: v_dual_mov_b32 v42, s20
; GFX12-NEXT: v_dual_mov_b32 v45, s23 :: v_dual_mov_b32 v44, s22
; GFX12-NEXT: v_mov_b32_e32 v47, s25
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
; GFX12-NEXT: v_dual_mov_b32 v46, s24 :: v_dual_mov_b32 v49, s37
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s36
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s40, s2, 21
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s40, s0, 21
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 18
+; GFX12-NEXT: s_lshr_b32 s6, s0, 18
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:400
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:400
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:384
; GFX12-NEXT: v_dual_mov_b32 v43, s13 :: v_dual_mov_b32 v42, s12
; GFX12-NEXT: v_dual_mov_b32 v45, s15 :: v_dual_mov_b32 v44, s14
; GFX12-NEXT: v_mov_b32_e32 v47, s17
-; GFX12-NEXT: s_lshr_b32 s8, s2, 19
+; GFX12-NEXT: s_lshr_b32 s8, s0, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v49, s41
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s40
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, s0
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:160
; GFX12-NEXT: v_dual_mov_b32 v43, s7 :: v_dual_mov_b32 v42, s6
; GFX12-NEXT: v_dual_mov_b32 v45, s9 :: v_dual_mov_b32 v44, s8
; GFX12-NEXT: v_mov_b32_e32 v47, s11
-; GFX12-NEXT: s_lshr_b32 s42, s2, 17
-; GFX12-NEXT: v_lshrrev_b16 v32, 10, s2
+; GFX12-NEXT: s_lshr_b32 s42, s0, 17
+; GFX12-NEXT: v_lshrrev_b16 v32, 10, s0
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v27, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v29, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v30, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v31, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v24, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v18, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v0, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v1, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v36, 1, s3
-; GFX12-NEXT: s_lshr_b32 s18, s3, 24
-; GFX12-NEXT: s_mov_b32 s4, s3
-; GFX12-NEXT: s_lshr_b32 s38, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v27, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v29, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v30, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v31, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v24, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v18, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v0, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v1, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v36, 1, s1
+; GFX12-NEXT: s_lshr_b32 s18, s1, 24
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_lshr_b32 s38, s0, 24
; GFX12-NEXT: v_dual_mov_b32 v46, s10 :: v_dual_mov_b32 v49, s43
; GFX12-NEXT: v_bfe_i32 v52, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v50, v3, 0, 1
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s42
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:144
; GFX12-NEXT: v_bfe_i32 v44, v9, 0, 1
; GFX12-NEXT: v_bfe_i32 v42, v7, 0, 1
; GFX12-NEXT: v_lshrrev_b16 v41, 2, s18
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:128
; GFX12-NEXT: v_lshrrev_b16 v54, 3, s18
; GFX12-NEXT: v_lshrrev_b16 v56, 6, s38
; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52
@@ -9841,9 +9841,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42
; GFX12-NEXT: v_bfe_i32 v46, v56, 0, 1
; GFX12-NEXT: v_bfe_i32 v56, v54, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:112
; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:96
; GFX12-NEXT: v_bfe_i32 v32, v32, 0, 1
; GFX12-NEXT: v_bfe_i32 v54, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v43, v35, 0, 1
@@ -9855,9 +9855,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
; GFX12-NEXT: v_lshrrev_b16 v40, 5, s18
; GFX12-NEXT: v_lshrrev_b16 v37, 6, s18
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:80
; GFX12-NEXT: v_bfe_i32 v32, v39, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5] offset:64
; GFX12-NEXT: v_bfe_i32 v41, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v27, 0, 1
; GFX12-NEXT: v_bfe_i32 v34, v40, 0, 1
@@ -9869,23 +9869,23 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v62, v37, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v61, 31, v60
; GFX12-NEXT: v_ashrrev_i32_e32 v59, 31, v58
-; GFX12-NEXT: global_store_b128 v12, v[39:42], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v12, v[39:42], s[4:5] offset:48
; GFX12-NEXT: v_bfe_i32 v39, v25, 0, 1
; GFX12-NEXT: v_bfe_i32 v37, v24, 0, 1
; GFX12-NEXT: v_bfe_i32 v64, v38, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[58:61], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v12, v[58:61], s[4:5] offset:32
; GFX12-NEXT: v_bfe_i32 v43, v23, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_bfe_i32 v24, v36, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
-; GFX12-NEXT: v_dual_mov_b32 v41, s2 :: v_dual_mov_b32 v42, s3
-; GFX12-NEXT: v_mov_b32_e32 v23, s5
-; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v41, s0 :: v_dual_mov_b32 v42, s1
+; GFX12-NEXT: v_mov_b32_e32 v23, s3
+; GFX12-NEXT: global_store_b128 v12, v[37:40], s[4:5] offset:16
; GFX12-NEXT: v_bfe_i32 v38, v20, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v18, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1]
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5]
; GFX12-NEXT: v_bfe_i32 v20, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38
@@ -9901,8 +9901,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v11, 4, s38
; GFX12-NEXT: v_lshrrev_b16 v2, 1, s38
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:352
; GFX12-NEXT: v_bfe_i32 v38, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v52, v55, 0, 1
@@ -9932,7 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32
; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v51, s9
+; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v51, s9
; GFX12-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v49, 31, v48
; GFX12-NEXT: v_ashrrev_i32_e32 v47, 31, v46
@@ -9949,22 +9949,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v12, v[14:17], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v12, v[40:43], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v12, v[26:29], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v12, v[22:25], s[0:1] offset:256
-; GFX12-NEXT: global_store_b128 v12, v[62:65], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:480
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v12, v[14:17], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v12, v[40:43], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v12, v[26:29], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v12, v[22:25], s[4:5] offset:256
+; GFX12-NEXT: global_store_b128 v12, v[62:65], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:480
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v12, v[54:57], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:448
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v12, v[54:57], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:448
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v12, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a87fa8b..a5ca228 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -38,13 +38,13 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: constant_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_short v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -77,12 +77,12 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -119,13 +119,13 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -147,12 +147,12 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -198,18 +198,18 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s0
; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v5
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -252,15 +252,15 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5] offset:4
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -299,14 +299,14 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -328,13 +328,13 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -377,16 +377,16 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -408,14 +408,14 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v8i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -608,41 +608,41 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 12
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 8
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 6
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 30
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 28
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 14
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 12
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 6
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 30
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 28
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 26
; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5]
@@ -651,35 +651,35 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 24
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 22
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 20
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 18
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5]
@@ -742,26 +742,26 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GFX12-LABEL: constant_load_v16i16_align2:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0xf
-; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
-; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
-; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
-; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16
-; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12
-; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
-; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
-; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
-; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
-; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
-; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
-; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
-; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
-; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
-; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
-; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
+; GFX12-NEXT: global_load_u16 v3, v8, s[2:3] offset:28
+; GFX12-NEXT: global_load_u16 v2, v8, s[2:3] offset:24
+; GFX12-NEXT: global_load_u16 v1, v8, s[2:3] offset:20
+; GFX12-NEXT: global_load_u16 v0, v8, s[2:3] offset:16
+; GFX12-NEXT: global_load_u16 v7, v8, s[2:3] offset:12
+; GFX12-NEXT: global_load_u16 v6, v8, s[2:3] offset:8
+; GFX12-NEXT: global_load_u16 v5, v8, s[2:3] offset:4
+; GFX12-NEXT: global_load_u16 v4, v8, s[2:3]
+; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[2:3] offset:30
+; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[2:3] offset:26
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[2:3] offset:22
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[2:3] offset:18
+; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[2:3] offset:14
+; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[2:3] offset:10
+; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[2:3] offset:6
+; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[2:3] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -808,13 +808,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -837,12 +837,12 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -885,13 +885,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -915,12 +915,12 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -963,13 +963,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -992,12 +992,12 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1040,13 +1040,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1070,12 +1070,12 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,16 +1118,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1152,16 +1152,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,16 +1205,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1240,16 +1240,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sext_i32_i16 s3, s2
-; GFX12-NEXT: s_ashr_i32 s2, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s0
+; GFX12-NEXT: s_ashr_i32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1298,18 +1298,18 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1338,16 +1338,16 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1397,17 +1397,17 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1440,16 +1440,16 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1503,20 +1503,20 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1545,19 +1545,19 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s3, 16
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s5, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s3, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,20 +1611,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1655,18 +1655,18 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 16
-; GFX12-NEXT: s_ashr_i32 s5, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s1, 16
+; GFX12-NEXT: s_ashr_i32 s3, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1743,34 +1743,34 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i32:
@@ -1807,26 +1807,26 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_and_b32 s9, s6, 0xffff
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT: s_lshr_b32 s8, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_lshr_b32 s7, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1903,34 +1903,34 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v8i16_to_v8i32:
@@ -1969,26 +1969,26 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 16
-; GFX12-NEXT: s_ashr_i32 s9, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: s_ashr_i32 s8, s3, 16
+; GFX12-NEXT: s_ashr_i32 s9, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_ashr_i32 s6, s1, 16
+; GFX12-NEXT: s_ashr_i32 s7, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2108,60 +2108,60 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s11, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s11, 0xffff
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i32:
@@ -2219,40 +2219,40 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s16, s11, 16
-; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
-; GFX12-NEXT: s_and_b32 s17, s10, 0xffff
-; GFX12-NEXT: s_lshr_b32 s10, s10, 16
-; GFX12-NEXT: s_lshr_b32 s14, s9, 16
-; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX12-NEXT: s_lshr_b32 s15, s8, 16
-; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT: s_lshr_b32 s12, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_lshr_b32 s13, s6, 16
-; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_lshr_b32 s16, s15, 16
+; GFX12-NEXT: s_and_b32 s15, s15, 0xffff
+; GFX12-NEXT: s_and_b32 s17, s14, 0xffff
+; GFX12-NEXT: s_lshr_b32 s14, s14, 16
+; GFX12-NEXT: s_lshr_b32 s0, s9, 16
+; GFX12-NEXT: s_and_b32 s1, s9, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s8, 16
+; GFX12-NEXT: s_and_b32 s3, s8, 0xffff
+; GFX12-NEXT: s_lshr_b32 s6, s11, 16
+; GFX12-NEXT: s_and_b32 s7, s11, 0xffff
+; GFX12-NEXT: s_lshr_b32 s8, s10, 16
+; GFX12-NEXT: s_and_b32 s9, s10, 0xffff
+; GFX12-NEXT: s_lshr_b32 s10, s13, 16
+; GFX12-NEXT: s_and_b32 s11, s13, 0xffff
+; GFX12-NEXT: s_lshr_b32 s13, s12, 16
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s14
; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s2
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2372,60 +2372,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s10, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s8, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s9
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s10, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s14, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v16i16_to_v16i32:
@@ -2487,40 +2487,40 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 16
-; GFX12-NEXT: s_ashr_i32 s17, s10, 16
-; GFX12-NEXT: s_sext_i32_i16 s10, s10
-; GFX12-NEXT: s_sext_i32_i16 s11, s11
-; GFX12-NEXT: s_ashr_i32 s14, s9, 16
-; GFX12-NEXT: s_ashr_i32 s15, s8, 16
-; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: s_ashr_i32 s16, s15, 16
+; GFX12-NEXT: s_ashr_i32 s17, s14, 16
+; GFX12-NEXT: s_sext_i32_i16 s14, s14
+; GFX12-NEXT: s_sext_i32_i16 s15, s15
+; GFX12-NEXT: s_ashr_i32 s0, s9, 16
+; GFX12-NEXT: s_ashr_i32 s1, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s9
+; GFX12-NEXT: s_sext_i32_i16 s3, s8
+; GFX12-NEXT: s_ashr_i32 s6, s11, 16
+; GFX12-NEXT: s_ashr_i32 s7, s10, 16
+; GFX12-NEXT: s_sext_i32_i16 s8, s11
+; GFX12-NEXT: s_sext_i32_i16 s9, s10
+; GFX12-NEXT: s_ashr_i32 s10, s13, 16
+; GFX12-NEXT: s_ashr_i32 s11, s12, 16
+; GFX12-NEXT: s_sext_i32_i16 s13, s13
+; GFX12-NEXT: s_sext_i32_i16 s12, s12
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 16
-; GFX12-NEXT: s_ashr_i32 s13, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s11
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5397,14 +5397,14 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5429,13 +5429,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5485,13 +5485,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5519,15 +5519,15 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5572,14 +5572,14 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5604,13 +5604,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5655,13 +5655,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5689,15 +5689,15 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5744,18 +5744,18 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5782,17 +5782,17 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5841,19 +5841,19 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5882,17 +5882,17 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5954,28 +5954,28 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i16_to_v4i64:
@@ -6009,22 +6009,22 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6094,32 +6094,32 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s3
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s3, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6156,25 +6156,25 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: s_lshr_b32 s8, s3, 16
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_mov_b32 s6, s1
+; GFX12-NEXT: s_lshr_b32 s8, s1, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9
-; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6266,46 +6266,46 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i64:
@@ -6357,31 +6357,32 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: s_and_b32 s6, 0xffff, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s2, 0
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_pack_hl_b32_b16 s1, s0, 0
+; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6492,57 +6493,57 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6598,38 +6599,38 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s14, s7
-; GFX12-NEXT: s_lshr_b32 s16, s7, 16
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GFX12-NEXT: s_mov_b32 s8, s5
-; GFX12-NEXT: s_lshr_b32 s10, s5, 16
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GFX12-NEXT: s_lshr_b32 s4, s4, 16
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v9, s15
; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17
-; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v13, s9
; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11
-; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6781,82 +6782,82 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s10, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 64
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index b0d8f72..5692d1d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -34,13 +34,13 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -73,12 +73,12 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v2i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -158,13 +158,13 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -207,15 +207,15 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -255,13 +255,13 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -304,16 +304,16 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v4i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -349,14 +349,14 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,33 +893,33 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-NOHSA-LABEL: constant_load_v11i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1421,14 +1421,14 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1552,15 +1552,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,14 +1599,14 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1640,12 +1640,12 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1686,15 +1686,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1730,15 +1730,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1781,16 +1781,16 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1829,14 +1829,14 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1884,19 +1884,18 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 31
-; GFX8-NOHSA-NEXT: s_mov_b32 s1, s3
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1940,17 +1939,17 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 31
-; GFX12-NEXT: s_ashr_i32 s5, s2, 31
+; GFX12-NEXT: s_ashr_i32 s2, s1, 31
+; GFX12-NEXT: s_ashr_i32 s3, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2004,23 +2003,23 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2070,17 +2069,17 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2145,29 +2144,29 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s4, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s6, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s0, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s3, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s2, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2226,22 +2225,22 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 31
-; GFX12-NEXT: s_ashr_i32 s9, s6, 31
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
+; GFX12-NEXT: s_ashr_i32 s8, s3, 31
+; GFX12-NEXT: s_ashr_i32 s9, s2, 31
+; GFX12-NEXT: s_ashr_i32 s6, s1, 31
+; GFX12-NEXT: s_ashr_i32 s7, s0, 31
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2683,32 +2682,32 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 31
-; GFX12-NEXT: s_ashr_i32 s17, s10, 31
-; GFX12-NEXT: s_ashr_i32 s14, s9, 31
-; GFX12-NEXT: s_ashr_i32 s15, s8, 31
+; GFX12-NEXT: s_ashr_i32 s16, s15, 31
+; GFX12-NEXT: s_ashr_i32 s17, s14, 31
+; GFX12-NEXT: s_ashr_i32 s6, s13, 31
+; GFX12-NEXT: s_ashr_i32 s7, s12, 31
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 31
-; GFX12-NEXT: s_ashr_i32 s13, s6, 31
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: s_ashr_i32 s2, s11, 31
+; GFX12-NEXT: s_ashr_i32 s3, s10, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_ashr_i32 s0, s9, 31
+; GFX12-NEXT: s_ashr_i32 s1, s8, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s2
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s9
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 66c73fda..9432584 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -34,14 +34,14 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: constant_load_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -63,13 +63,13 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -111,16 +111,16 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v2i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -142,14 +142,14 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -205,24 +205,24 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v3i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x10
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: v_mov_b32_e32 v6, s9
; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -253,19 +253,19 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[8:9], s[6:7], 0x10
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 889755c..29ca6c6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -39,13 +39,13 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-NOHSA-LABEL: constant_load_i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_byte v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -78,12 +78,12 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -126,13 +126,13 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v2i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -165,12 +165,12 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -217,14 +217,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v3i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 2
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
@@ -278,14 +278,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[4:5] offset:2
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -322,13 +322,13 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v4i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -350,12 +350,12 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -394,14 +394,14 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v8i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -423,13 +423,13 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -472,16 +472,16 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v16i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -503,14 +503,14 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -553,13 +553,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -582,12 +582,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -630,13 +630,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -660,12 +660,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -708,13 +708,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -737,12 +737,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -785,13 +785,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -815,12 +815,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -868,13 +868,13 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -911,16 +911,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -968,13 +968,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -1011,16 +1011,16 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1070,17 +1070,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1112,17 +1112,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_and_b32 s3, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_and_b32 s1, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1173,18 +1173,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1216,18 +1216,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_sext_i32_i8 s3, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_sext_i32_i8 s1, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1280,19 +1280,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1322,19 +1322,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: s_and_b32 s4, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: s_and_b32 s2, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1386,20 +1386,20 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1431,19 +1431,19 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_ashr_i32 s3, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s4, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_ashr_i32 s1, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s2, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1518,30 +1518,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1583,26 +1583,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s7, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 0xff
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s7, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 0xff
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1677,32 +1677,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1747,28 +1747,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_ashr_i32 s6, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s7, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: s_ashr_i32 s4, s3, 24
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s3, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s7, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: s_ashr_i32 s2, s1, 24
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v4, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_mov_b32_e32 v4, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s3
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1889,55 +1889,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s3, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s14, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1999,40 +1999,40 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: s_lshr_b32 s8, s6, 24
-; GFX12-NEXT: s_lshr_b32 s9, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s12, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT: s_and_b32 s13, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT: s_and_b32 s11, s5, 0xff
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: s_lshr_b32 s8, s2, 24
+; GFX12-NEXT: s_lshr_b32 s9, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_and_b32 s12, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_and_b32 s13, s3, 0xff
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
+; GFX12-NEXT: s_and_b32 s11, s1, 0xff
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
-; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s10, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_and_b32 s10, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_and_b32 v9, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: v_mov_b32_e32 v14, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2153,59 +2153,59 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s10, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s6, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s3
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v2, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2275,44 +2275,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s7, 24
-; GFX12-NEXT: s_sext_i32_i8 s13, s7
-; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010
-; GFX12-NEXT: s_ashr_i32 s10, s6, 24
-; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_ashr_i32 s12, s3, 24
+; GFX12-NEXT: s_sext_i32_i8 s13, s3
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80010
+; GFX12-NEXT: s_ashr_i32 s10, s2, 24
+; GFX12-NEXT: s_bfe_i32 s11, s2, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s2
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12
-; GFX12-NEXT: s_ashr_i32 s8, s5, 24
-; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: s_ashr_i32 s8, s1, 24
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s0, s0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_mov_b32_e32 v6, s11
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: v_mov_b32_e32 v10, s9
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_mov_b32_e32 v12, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s7
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,71 +2724,71 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: s_lshr_b32 s15, s9, 24
-; GFX12-NEXT: s_lshr_b32 s17, s11, 24
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_and_b32 s23, s9, 0xff
-; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX12-NEXT: s_and_b32 s25, s11, 0xff
-; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
-; GFX12-NEXT: s_lshr_b32 s14, s8, 24
-; GFX12-NEXT: s_lshr_b32 s16, s10, 24
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_and_b32 s22, s8, 0xff
-; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
-; GFX12-NEXT: s_and_b32 s24, s10, 0xff
-; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: s_lshr_b32 s7, s13, 24
+; GFX12-NEXT: s_lshr_b32 s17, s15, 24
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_and_b32 s23, s13, 0xff
+; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_and_b32 s25, s15, 0xff
+; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010
+; GFX12-NEXT: s_lshr_b32 s6, s12, 24
+; GFX12-NEXT: s_lshr_b32 s16, s14, 24
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_and_b32 s22, s12, 0xff
+; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_and_b32 s24, s14, 0xff
+; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17
-; GFX12-NEXT: s_lshr_b32 s13, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s21, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: s_lshr_b32 s3, s11, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_and_b32 s21, s11, 0xff
+; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_and_b32 v13, 0xffff, v13
; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11
-; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v30, s12 :: v_dual_and_b32 v29, 0xffff, v10
; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15
-; GFX12-NEXT: v_mov_b32_e32 v26, s7
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
-; GFX12-NEXT: s_and_b32 s20, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_mov_b32_e32 v26, s11
+; GFX12-NEXT: s_lshr_b32 s2, s10, 24
+; GFX12-NEXT: s_and_b32 s20, s10, 0xff
+; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
-; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
-; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_and_b32 s19, s5, 0xff
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s18, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
-; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
+; GFX12-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: v_dual_mov_b32 v31, s6 :: v_dual_mov_b32 v20, s20
+; GFX12-NEXT: s_lshr_b32 s1, s9, 24
+; GFX12-NEXT: s_and_b32 s19, s9, 0xff
+; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v27, s3 :: v_dual_mov_b32 v22, s10
+; GFX12-NEXT: s_lshr_b32 s0, s8, 24
+; GFX12-NEXT: s_and_b32 s18, s8, 0xff
+; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v23, s2 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s8
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3231,78 +3231,78 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_ashr_i32 s20, s9, 24
-; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s9, s9
-; GFX12-NEXT: s_ashr_i32 s24, s11, 24
-; GFX12-NEXT: s_sext_i32_i8 s25, s11
-; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_ashr_i32 s18, s8, 24
-; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s8, s8
-; GFX12-NEXT: s_ashr_i32 s22, s10, 24
-; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_ashr_i32 s20, s13, 24
+; GFX12-NEXT: s_bfe_i32 s21, s13, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s13, s13
+; GFX12-NEXT: s_ashr_i32 s24, s15, 24
+; GFX12-NEXT: s_sext_i32_i8 s25, s15
+; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_ashr_i32 s18, s12, 24
+; GFX12-NEXT: s_bfe_i32 s19, s12, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s12, s12
+; GFX12-NEXT: s_ashr_i32 s22, s14, 24
+; GFX12-NEXT: s_bfe_i32 s23, s14, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s14, s14
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s5, 24
-; GFX12-NEXT: s_ashr_i32 s14, s6, 24
-; GFX12-NEXT: s_ashr_i32 s16, s7, 24
-; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s7, s7
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_ashr_i32 s0, s8, 24
+; GFX12-NEXT: s_bfe_i32 s1, s8, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s8
+; GFX12-NEXT: s_ashr_i32 s3, s9, 24
+; GFX12-NEXT: s_ashr_i32 s8, s10, 24
+; GFX12-NEXT: s_ashr_i32 s16, s11, 24
+; GFX12-NEXT: s_bfe_i32 s17, s11, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s11, s11
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
-; GFX12-NEXT: v_mov_b32_e32 v2, s11
-; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
+; GFX12-NEXT: v_mov_b32_e32 v2, s15
+; GFX12-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v23, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s3
; GFX12-NEXT: v_mov_b32_e32 v11, s20
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
+; GFX12-NEXT: s_bfe_i32 s6, s9, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s7, s9
+; GFX12-NEXT: s_bfe_i32 s9, s10, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v31, s18
; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v28, s12 :: v_dual_mov_b32 v15, s0
; GFX12-NEXT: v_mov_b32_e32 v30, s19
-; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
-; GFX12-NEXT: v_mov_b32_e32 v24, s7
+; GFX12-NEXT: v_mov_b32_e32 v24, s11
; GFX12-NEXT: v_mov_b32_e32 v26, s17
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v20, s6
-; GFX12-NEXT: v_mov_b32_e32 v22, s15
+; GFX12-NEXT: v_mov_b32_e32 v20, s10
+; GFX12-NEXT: v_mov_b32_e32 v22, s9
; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v16, s5
-; GFX12-NEXT: v_mov_b32_e32 v18, s13
+; GFX12-NEXT: v_mov_b32_e32 v16, s7
+; GFX12-NEXT: v_mov_b32_e32 v18, s6
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5234,14 +5234,14 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5266,13 +5266,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5318,13 +5318,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5352,15 +5352,15 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5405,13 +5405,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -5436,12 +5436,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5487,13 +5487,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5521,15 +5521,15 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5580,14 +5580,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v2
@@ -5627,16 +5627,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5689,13 +5689,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v2, 8, v0
; GFX8-NOHSA-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5738,10 +5738,10 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5750,7 +5750,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5812,28 +5812,28 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i8_to_v4i64:
@@ -5870,23 +5870,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5958,31 +5958,31 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -6021,26 +6021,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 24
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6132,40 +6132,40 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
@@ -6227,34 +6227,34 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
+; GFX12-NEXT: s_bfe_u32 s2, s1, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_lshr_b32 s5, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_lshr_b32 s3, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s3, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_and_b32 s0, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6368,55 +6368,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s0, 24
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -6479,40 +6479,40 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: s_lshr_b32 s8, s2, 16
-; GFX12-NEXT: s_lshr_b32 s10, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s13
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6664,82 +6664,82 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s7, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s14, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s11, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s3, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i8_to_v16i64:
@@ -6833,55 +6833,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
+; GFX12-NEXT: s_bfe_u32 s6, s3, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_lshr_b32 s3, s7, 24
-; GFX12-NEXT: s_lshr_b32 s2, s5, 24
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_lshr_b32 s7, s3, 24
+; GFX12-NEXT: s_lshr_b32 s6, s1, 24
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_and_b32 s2, s6, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
-; GFX12-NEXT: s_and_b32 s2, s7, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_and_b32 s2, s3, 0xff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
-; GFX12-NEXT: s_and_b32 s2, s5, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s4, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7081,9 +7081,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s11, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 16
@@ -7092,16 +7092,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 24
; GFX8-NOHSA-NEXT: s_mov_b32 s24, s11
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s9
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s10
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s8
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX8-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -7110,18 +7110,18 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x70
; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v0, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s11
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15
@@ -7131,53 +7131,53 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s8
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 48
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 0x60
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 64
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -7282,44 +7282,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_lshr_b32 s10, s6, 16
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s10
+; GFX12-NEXT: v_lshrrev_b16 v21, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v23, 8, s8
+; GFX12-NEXT: s_lshr_b32 s2, s11, 16
+; GFX12-NEXT: s_lshr_b32 s6, s10, 16
+; GFX12-NEXT: s_lshr_b32 s12, s10, 24
; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8
-; GFX12-NEXT: s_lshr_b32 s18, s4, 24
-; GFX12-NEXT: s_mov_b32 s20, s7
-; GFX12-NEXT: s_lshr_b32 s14, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56
+; GFX12-NEXT: s_lshr_b32 s18, s8, 24
+; GFX12-NEXT: s_mov_b32 s20, s11
+; GFX12-NEXT: s_lshr_b32 s14, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[10:11], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8
-; GFX12-NEXT: s_lshr_b32 s16, s4, 16
-; GFX12-NEXT: s_mov_b32 s22, s5
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
+; GFX12-NEXT: s_lshr_b32 s16, s8, 16
+; GFX12-NEXT: s_mov_b32 s22, s9
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11
-; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v7, s9
+; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, s7
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s13
; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15
; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v9, s25
; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23
; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22
@@ -7332,16 +7332,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT: global_store_b128 v30, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v30, v[20:23], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1]
+; GFX12-NEXT: global_store_b128 v30, v[12:15], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v30, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v30, v[4:7], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v30, v[26:29], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v30, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v30, v[22:25], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7613,159 +7613,159 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s9, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s11, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s8, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s5, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s20, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s21, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s22, s8, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s8
-; GFX8-NOHSA-NEXT: s_and_b32 s23, s9, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s9
-; GFX8-NOHSA-NEXT: s_and_b32 s24, s10, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s10
-; GFX8-NOHSA-NEXT: s_and_b32 s25, s11, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s11, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s13, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s15, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s14, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s12, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s10, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s8, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s9, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s21, s10, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s10
+; GFX8-NOHSA-NEXT: s_and_b32 s22, s11, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s11
+; GFX8-NOHSA-NEXT: s_and_b32 s23, s12, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s12
+; GFX8-NOHSA-NEXT: s_and_b32 s24, s13, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s13
+; GFX8-NOHSA-NEXT: s_and_b32 s25, s14, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s14
+; GFX8-NOHSA-NEXT: s_and_b32 s26, s15, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s11, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xf0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xb0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s15, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xf0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xb0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x70
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xd0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x90
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xd0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x90
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xc0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xe0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xc0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x80
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x80
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -8970,13 +8970,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9008,12 +9008,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9056,13 +9056,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9096,12 +9096,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9144,13 +9144,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9182,12 +9182,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9230,13 +9230,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9270,12 +9270,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9324,14 +9324,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v4, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -9364,17 +9364,17 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9427,14 +9427,14 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NOHSA-NEXT: v_ashrrev_i16_e32 v2, 8, v2
@@ -9477,17 +9477,17 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9540,20 +9540,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s0, v3, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s1, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9606,23 +9606,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX12-NEXT: s_lshr_b32 s2, s2, 24
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s1
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s0
+; GFX12-NEXT: s_lshr_b32 s0, s0, 24
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
-; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9681,22 +9681,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s1, 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -9757,22 +9757,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x80000
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9843,29 +9843,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s4, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s1, 24
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s4, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s1, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9948,30 +9948,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
+; GFX12-NEXT: s_lshr_b32 s3, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s1
; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s3
; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_lshr_b32 s2, s3, 24
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_lshr_b32 s0, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
-; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v3, s0, 16, v3
+; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v5
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10054,36 +10054,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s6
+; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
+; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s5, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s4, 0x80000
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v1
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s4, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s3, v1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10179,29 +10179,29 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000
-; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 16
-; GFX12-NEXT: s_lshr_b32 s7, s3, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80000
+; GFX12-NEXT: s_lshr_b32 s6, s0, 16
+; GFX12-NEXT: s_lshr_b32 s7, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s1
+; GFX12-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8
; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s7, 0x80000
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX12-NEXT: s_pack_ll_b32_b16 s1, s3, s2
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
-; GFX12-NEXT: v_mov_b32_e32 v3, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10314,53 +10314,53 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s1, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s9, s10, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s6, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10509,27 +10509,27 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s6, 16
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: s_lshr_b32 s11, s4, 16
-; GFX12-NEXT: s_lshr_b32 s13, s5, 16
-; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
-; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
-; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
+; GFX12-NEXT: s_lshr_b32 s7, s2, 16
+; GFX12-NEXT: s_lshr_b32 s9, s3, 16
+; GFX12-NEXT: s_lshr_b32 s11, s0, 16
+; GFX12-NEXT: s_lshr_b32 s13, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s1
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
-; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s7
; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7
@@ -10537,21 +10537,21 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_lshr_b32 s8, s7, 24
-; GFX12-NEXT: s_lshr_b32 s10, s4, 24
-; GFX12-NEXT: s_lshr_b32 s12, s5, 24
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_lshr_b32 s8, s3, 24
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
-; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v5, s6, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10689,62 +10689,63 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s5, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NOHSA-NEXT: s_and_b32 s10, 0xffff, s10
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 16
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s4, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80000
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s10, v0
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s4, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s9, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s4, v1
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s0, v1
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s8, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s7, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s9, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s0, v1
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s3
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s8, 0x80000
-; GFX8-NOHSA-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s8
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s3, v5
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s1, v5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s5, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10920,40 +10921,40 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s6, 16
-; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: v_ashrrev_i16 v5, 8, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s9, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1
+; GFX12-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s12, s3, 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX12-NEXT: s_lshr_b32 s10, s4, 16
-; GFX12-NEXT: s_lshr_b32 s11, s5, 16
-; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4
-; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
-; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
-; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000
-; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7
-; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
+; GFX12-NEXT: s_lshr_b32 s7, s3, 16
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
+; GFX12-NEXT: s_bfe_i32 s1, s9, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s8, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s10
+; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s11
; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
-; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s2
; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
-; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
-; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
-; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
-; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
-; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
+; GFX12-NEXT: v_ashrrev_i16 v9, 8, s9
+; GFX12-NEXT: v_ashrrev_i16 v10, 8, s8
+; GFX12-NEXT: s_bfe_i32 s2, s7, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s3
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s0
; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11
@@ -10962,8 +10963,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 21e27bf..8a40901 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -40,19 +40,19 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
;
; GCN-NOHSA-VI-LABEL: global_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_i16:
@@ -145,19 +145,19 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i16:
@@ -236,20 +236,20 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i16:
@@ -362,19 +362,19 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i16:
@@ -447,19 +447,19 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i16:
@@ -546,22 +546,22 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16:
@@ -696,30 +696,30 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:14
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:10
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:2
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:30
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:26
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:22
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:18
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:12
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:8
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[4:7], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:28
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:24
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:20
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:10
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:2
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:30
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:26
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:22
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:18
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:12
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:8
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:28
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:24
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:20
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -751,8 +751,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v14, v18
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v15, v19
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16_align2:
@@ -834,19 +834,19 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i32:
@@ -919,19 +919,19 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i32:
@@ -1007,19 +1007,19 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i32:
@@ -1092,19 +1092,19 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i32:
@@ -1184,21 +1184,21 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i32:
@@ -1283,21 +1283,21 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i32:
@@ -1385,22 +1385,22 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v3i16_to_v3i32:
@@ -1495,22 +1495,22 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v3i16_to_v3i32:
@@ -1613,23 +1613,23 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i32:
@@ -1729,23 +1729,23 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i32:
@@ -1859,17 +1859,17 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3
@@ -1879,8 +1879,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i16_to_v8i32:
@@ -2008,17 +2008,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
@@ -2028,8 +2028,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i16_to_v8i32:
@@ -5158,21 +5158,21 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i64:
@@ -5255,21 +5255,21 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i64:
@@ -5350,21 +5350,21 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i64:
@@ -5442,21 +5442,21 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i64:
@@ -5543,23 +5543,23 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i64:
@@ -5653,24 +5653,24 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i64:
@@ -5779,19 +5779,19 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
@@ -5799,8 +5799,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i64:
@@ -5925,17 +5925,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
@@ -5948,8 +5948,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0f9cc33..121c436 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -39,19 +39,19 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
;
; GCNX3-NOHSA-LABEL: global_load_i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_i32:
@@ -118,19 +118,19 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v2i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i32:
@@ -198,19 +198,19 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v3i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i32:
@@ -282,19 +282,19 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v4i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i32:
@@ -375,22 +375,22 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v8i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i32:
@@ -492,25 +492,25 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v9i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v9i32:
@@ -623,25 +623,25 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v10i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v10i32:
@@ -753,25 +753,25 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v11i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v11i32:
@@ -888,25 +888,25 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v12i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v12i32:
@@ -1032,28 +1032,28 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v16i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i32:
@@ -1147,20 +1147,20 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i32_to_i64:
@@ -1230,20 +1230,20 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i32_to_i64:
@@ -1314,20 +1314,20 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i32_to_v1i64:
@@ -1397,20 +1397,20 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i32_to_v1i64:
@@ -1487,23 +1487,23 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i32_to_v2i64:
@@ -1583,22 +1583,22 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i32_to_v2i64:
@@ -1694,27 +1694,27 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i32_to_v4i64:
@@ -1821,17 +1821,17 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
@@ -1841,8 +1841,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i32_to_v4i64:
@@ -1981,36 +1981,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i32_to_v8i64:
@@ -4515,14 +4515,14 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v32i32:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
@@ -4531,22 +4531,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v32i32:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index f19eeee..76d5268 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -4,25 +4,25 @@
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
@@ -50,25 +50,25 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB1_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB1_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT: global_load_b128 v[1:4], v0, s[0:1] offset:-176
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: .LBB1_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -96,26 +96,26 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: .LBB2_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
-; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
+; GCN-NEXT: s_prefetch_data s[6:7], 0xb0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
+; GCN-NEXT: s_add_nc_u64 s[6:7], s[6:7], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: .LBB2_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -143,20 +143,20 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s6, 0
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: s_add_co_i32 s2, s2, -1
-; GCN-NEXT: s_add_co_i32 s0, s0, 16
-; GCN-NEXT: s_add_co_i32 s1, s1, 16
+; GCN-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: s_add_co_i32 s6, s6, -1
+; GCN-NEXT: s_add_co_i32 s4, s4, 16
+; GCN-NEXT: s_add_co_i32 s5, s5, 16
; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_wait_dscnt 0x1
; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT: s_wait_dscnt 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index cb3ea2e..ad4af2f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -24,13 +24,13 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
; GCN-NEXT: ds_write_b8 v0, v1
; GCN-NEXT: ds_read_u8 v2, v0 offset:2
; GCN-NEXT: ds_read_u16 v3, v0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v0, v2 offset:6
; GCN-NEXT: ds_write_b16 v0, v3 offset:4
-; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: v_cmp_eq_u16_sdwa s[0:1], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GCN-NEXT: global_store_byte v0, v1, s[2:3]
; GCN-NEXT: s_endpgm
; CHECK-LABEL: define protected amdgpu_kernel void @test(
; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index c6a734a..32318ab 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
@@ -37,10 +38,10 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_write_b32 v1, v2 offset:256
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:256
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
@@ -88,11 +89,11 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v3, v0 offset:256
; GCN-NEXT: ds_read_b32 v0, v0 offset:512
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e9a1b38..b11cd19 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -74,7 +74,7 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -83,7 +83,7 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: s_addc_u32 s1, s5, s7
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 6707132..7361e57 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -79,26 +79,26 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
@@ -131,24 +131,24 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -353,27 +353,27 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
@@ -404,24 +404,24 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -438,26 +438,26 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
@@ -485,24 +485,24 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -519,27 +519,27 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
@@ -582,26 +582,26 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -620,27 +620,27 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
@@ -673,26 +673,26 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 995c8c8..5fd0144 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -9,17 +9,17 @@
define amdgpu_kernel void @mad_u16(
; GFX8-LABEL: mad_u16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s10, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: flat_load_ushort v6, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -27,57 +27,57 @@ define amdgpu_kernel void @mad_u16(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v3, v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mad_u16 v2, v6, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: mad_u16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mad_u16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[8:9] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v3, v0, s[6:7] glc dlc
+; GFX10-NEXT: global_load_ushort v3, v0, s[10:11] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mad_u16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v0, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
-; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 400298b..c1c526c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s6, s7
-; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7
-; GFX11-NEXT: s_add_u32 s0, s2, s0
-; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s7
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s7
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -976,15 +976,15 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s6
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_mov_b32 s6, s7
-; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s7, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7]
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index def0dfa..3bb5732 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -35,14 +35,14 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: madak_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -86,12 +86,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -133,12 +133,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -504,14 +504,14 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX8-LABEL: madak_inline_imm_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -555,12 +555,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -602,12 +602,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -835,13 +835,13 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
-; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-MAD-NEXT: s_nop 0
; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-MAD-NEXT: s_endpgm
@@ -882,11 +882,11 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -1024,20 +1024,20 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src0_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, |v5|, v2, s0
@@ -1077,12 +1077,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1177,20 +1177,20 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src1_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, v5, |v2|, s0
@@ -1230,12 +1230,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2|
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1278,12 +1278,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index 2b5d32f..e8c6baa 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -397,82 +397,82 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-LABEL: long_load_chain:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3e
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x10
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x20
-; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30
-; GFX10-NEXT: s_load_dword s6, s[0:1], 0x40
-; GFX10-NEXT: s_load_dword s7, s[0:1], 0x50
-; GFX10-NEXT: s_load_dword s8, s[0:1], 0x60
-; GFX10-NEXT: s_load_dword s9, s[0:1], 0x70
-; GFX10-NEXT: s_load_dword s10, s[0:1], 0x80
-; GFX10-NEXT: s_load_dword s11, s[0:1], 0x90
-; GFX10-NEXT: s_load_dword s12, s[0:1], 0xa0
-; GFX10-NEXT: s_load_dword s13, s[0:1], 0xb0
-; GFX10-NEXT: s_load_dword s14, s[0:1], 0xc0
-; GFX10-NEXT: s_load_dword s15, s[0:1], 0xd0
-; GFX10-NEXT: s_load_dword s16, s[0:1], 0xe0
-; GFX10-NEXT: s_load_dword s17, s[0:1], 0xf0
-; GFX10-NEXT: s_load_dword s18, s[0:1], 0x100
-; GFX10-NEXT: s_load_dword s19, s[0:1], 0x110
-; GFX10-NEXT: s_load_dword s20, s[0:1], 0x120
-; GFX10-NEXT: s_load_dword s21, s[0:1], 0x130
-; GFX10-NEXT: s_load_dword s22, s[0:1], 0x140
-; GFX10-NEXT: s_load_dword s23, s[0:1], 0x150
-; GFX10-NEXT: s_load_dword s24, s[0:1], 0x160
-; GFX10-NEXT: s_load_dword s25, s[0:1], 0x170
-; GFX10-NEXT: s_load_dword s26, s[0:1], 0x180
-; GFX10-NEXT: s_load_dword s27, s[0:1], 0x190
-; GFX10-NEXT: s_load_dword s28, s[0:1], 0x1a0
-; GFX10-NEXT: s_load_dword s29, s[0:1], 0x1b0
-; GFX10-NEXT: s_load_dword s30, s[0:1], 0x1c0
-; GFX10-NEXT: s_load_dword s31, s[0:1], 0x1d0
-; GFX10-NEXT: s_load_dword s33, s[0:1], 0x1e0
-; GFX10-NEXT: s_load_dword s34, s[0:1], 0x1f0
-; GFX10-NEXT: s_load_dword s35, s[0:1], 0x200
-; GFX10-NEXT: s_load_dword s36, s[0:1], 0x210
-; GFX10-NEXT: s_load_dword s37, s[0:1], 0x220
-; GFX10-NEXT: s_load_dword s38, s[0:1], 0x230
-; GFX10-NEXT: s_load_dword s39, s[0:1], 0x240
-; GFX10-NEXT: s_load_dword s40, s[0:1], 0x250
-; GFX10-NEXT: s_load_dword s41, s[0:1], 0x260
-; GFX10-NEXT: s_load_dword s42, s[0:1], 0x270
-; GFX10-NEXT: s_load_dword s43, s[0:1], 0x280
-; GFX10-NEXT: s_load_dword s44, s[0:1], 0x290
-; GFX10-NEXT: s_load_dword s45, s[0:1], 0x2a0
-; GFX10-NEXT: s_load_dword s46, s[0:1], 0x2b0
-; GFX10-NEXT: s_load_dword s47, s[0:1], 0x2c0
-; GFX10-NEXT: s_load_dword s48, s[0:1], 0x2d0
-; GFX10-NEXT: s_load_dword s49, s[0:1], 0x2e0
-; GFX10-NEXT: s_load_dword s50, s[0:1], 0x2f0
-; GFX10-NEXT: s_load_dword s51, s[0:1], 0x300
-; GFX10-NEXT: s_load_dword s52, s[0:1], 0x310
-; GFX10-NEXT: s_load_dword s53, s[0:1], 0x320
-; GFX10-NEXT: s_load_dword s54, s[0:1], 0x330
-; GFX10-NEXT: s_load_dword s55, s[0:1], 0x340
-; GFX10-NEXT: s_load_dword s56, s[0:1], 0x350
-; GFX10-NEXT: s_load_dword s57, s[0:1], 0x360
-; GFX10-NEXT: s_load_dword s58, s[0:1], 0x370
-; GFX10-NEXT: s_load_dword s59, s[0:1], 0x380
-; GFX10-NEXT: s_load_dword s60, s[0:1], 0x390
-; GFX10-NEXT: s_load_dword s61, s[0:1], 0x3a0
-; GFX10-NEXT: s_load_dword s62, s[0:1], 0x3b0
-; GFX10-NEXT: s_load_dword s63, s[0:1], 0x3c0
-; GFX10-NEXT: s_load_dword s64, s[0:1], 0x3d0
-; GFX10-NEXT: s_load_dword s65, s[0:1], 0x3e0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[2:3], 0x10
+; GFX10-NEXT: s_load_dword s4, s[2:3], 0x20
+; GFX10-NEXT: s_load_dword s5, s[2:3], 0x30
+; GFX10-NEXT: s_load_dword s6, s[2:3], 0x40
+; GFX10-NEXT: s_load_dword s7, s[2:3], 0x50
+; GFX10-NEXT: s_load_dword s8, s[2:3], 0x60
+; GFX10-NEXT: s_load_dword s9, s[2:3], 0x70
+; GFX10-NEXT: s_load_dword s10, s[2:3], 0x80
+; GFX10-NEXT: s_load_dword s11, s[2:3], 0x90
+; GFX10-NEXT: s_load_dword s12, s[2:3], 0xa0
+; GFX10-NEXT: s_load_dword s13, s[2:3], 0xb0
+; GFX10-NEXT: s_load_dword s14, s[2:3], 0xc0
+; GFX10-NEXT: s_load_dword s15, s[2:3], 0xd0
+; GFX10-NEXT: s_load_dword s16, s[2:3], 0xe0
+; GFX10-NEXT: s_load_dword s17, s[2:3], 0xf0
+; GFX10-NEXT: s_load_dword s18, s[2:3], 0x100
+; GFX10-NEXT: s_load_dword s19, s[2:3], 0x110
+; GFX10-NEXT: s_load_dword s20, s[2:3], 0x120
+; GFX10-NEXT: s_load_dword s21, s[2:3], 0x130
+; GFX10-NEXT: s_load_dword s22, s[2:3], 0x140
+; GFX10-NEXT: s_load_dword s23, s[2:3], 0x150
+; GFX10-NEXT: s_load_dword s24, s[2:3], 0x160
+; GFX10-NEXT: s_load_dword s25, s[2:3], 0x170
+; GFX10-NEXT: s_load_dword s26, s[2:3], 0x180
+; GFX10-NEXT: s_load_dword s27, s[2:3], 0x190
+; GFX10-NEXT: s_load_dword s28, s[2:3], 0x1a0
+; GFX10-NEXT: s_load_dword s29, s[2:3], 0x1b0
+; GFX10-NEXT: s_load_dword s30, s[2:3], 0x1c0
+; GFX10-NEXT: s_load_dword s31, s[2:3], 0x1d0
+; GFX10-NEXT: s_load_dword s33, s[2:3], 0x1e0
+; GFX10-NEXT: s_load_dword s34, s[2:3], 0x1f0
+; GFX10-NEXT: s_load_dword s35, s[2:3], 0x200
+; GFX10-NEXT: s_load_dword s36, s[2:3], 0x210
+; GFX10-NEXT: s_load_dword s37, s[2:3], 0x220
+; GFX10-NEXT: s_load_dword s38, s[2:3], 0x230
+; GFX10-NEXT: s_load_dword s39, s[2:3], 0x240
+; GFX10-NEXT: s_load_dword s40, s[2:3], 0x250
+; GFX10-NEXT: s_load_dword s41, s[2:3], 0x260
+; GFX10-NEXT: s_load_dword s42, s[2:3], 0x270
+; GFX10-NEXT: s_load_dword s43, s[2:3], 0x280
+; GFX10-NEXT: s_load_dword s44, s[2:3], 0x290
+; GFX10-NEXT: s_load_dword s45, s[2:3], 0x2a0
+; GFX10-NEXT: s_load_dword s46, s[2:3], 0x2b0
+; GFX10-NEXT: s_load_dword s47, s[2:3], 0x2c0
+; GFX10-NEXT: s_load_dword s48, s[2:3], 0x2d0
+; GFX10-NEXT: s_load_dword s49, s[2:3], 0x2e0
+; GFX10-NEXT: s_load_dword s50, s[2:3], 0x2f0
+; GFX10-NEXT: s_load_dword s51, s[2:3], 0x300
+; GFX10-NEXT: s_load_dword s52, s[2:3], 0x310
+; GFX10-NEXT: s_load_dword s53, s[2:3], 0x320
+; GFX10-NEXT: s_load_dword s54, s[2:3], 0x330
+; GFX10-NEXT: s_load_dword s55, s[2:3], 0x340
+; GFX10-NEXT: s_load_dword s56, s[2:3], 0x350
+; GFX10-NEXT: s_load_dword s57, s[2:3], 0x360
+; GFX10-NEXT: s_load_dword s58, s[2:3], 0x370
+; GFX10-NEXT: s_load_dword s59, s[2:3], 0x380
+; GFX10-NEXT: s_load_dword s60, s[2:3], 0x390
+; GFX10-NEXT: s_load_dword s61, s[2:3], 0x3a0
+; GFX10-NEXT: s_load_dword s62, s[2:3], 0x3b0
+; GFX10-NEXT: s_load_dword s63, s[2:3], 0x3c0
+; GFX10-NEXT: s_load_dword s64, s[2:3], 0x3d0
+; GFX10-NEXT: s_load_dword s65, s[2:3], 0x3e0
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s66, s[0:1], 0x3f0
-; GFX10-NEXT: s_load_dword s67, s[0:1], 0x400
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x410
+; GFX10-NEXT: s_load_dword s66, s[2:3], 0x3f0
+; GFX10-NEXT: s_load_dword s67, s[2:3], 0x400
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x410
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s2
+; GFX10-NEXT: ; use s0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s3
+; GFX10-NEXT: ; use s1
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
@@ -664,89 +664,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-NEXT: ; use s67
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s0
+; GFX10-NEXT: ; use s2
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: long_load_chain:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX11-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX11-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX11-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX11-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX11-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX11-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX11-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX11-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX11-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX11-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX11-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX11-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX11-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX11-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX11-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX11-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX11-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX11-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX11-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX11-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX11-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX11-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX11-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX11-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX11-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX11-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX11-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX11-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX11-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX11-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX11-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX11-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX11-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX11-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX11-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX11-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX11-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX11-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX11-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX11-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX11-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX11-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX11-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX11-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX11-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX11-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX11-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX11-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX11-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX11-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX11-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX11-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX11-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX11-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX11-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX11-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX11-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX11-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX11-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX11-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX11-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX11-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX11-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX11-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX11-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX11-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX11-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX11-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX11-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX11-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX11-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX11-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX11-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX11-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX11-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX11-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX11-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX11-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX11-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX11-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX11-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX11-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX11-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX11-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX11-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX11-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX11-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX11-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX11-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX11-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX11-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX11-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX11-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX11-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX11-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX11-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX11-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX11-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX11-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX11-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX11-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX11-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX11-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX11-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX11-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX11-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX11-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX11-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX11-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX11-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX11-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX11-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX11-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX11-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX11-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s2
+; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ; use s1
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
@@ -938,89 +938,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX11-NEXT: ; use s67
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ; use s2
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: long_load_chain:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX12-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX12-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX12-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX12-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX12-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX12-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX12-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX12-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX12-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX12-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX12-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX12-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX12-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX12-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX12-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX12-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX12-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX12-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX12-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX12-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX12-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX12-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX12-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX12-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX12-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX12-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX12-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX12-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX12-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX12-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX12-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX12-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX12-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX12-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX12-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX12-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX12-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX12-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX12-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX12-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX12-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX12-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX12-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX12-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX12-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX12-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX12-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX12-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX12-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX12-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX12-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX12-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX12-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX12-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX12-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX12-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX12-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX12-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX12-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX12-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX12-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX12-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX12-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX12-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX12-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX12-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX12-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX12-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX12-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX12-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX12-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX12-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX12-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX12-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX12-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX12-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX12-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX12-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX12-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX12-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX12-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX12-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX12-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX12-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX12-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX12-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX12-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX12-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX12-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX12-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX12-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX12-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX12-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX12-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX12-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX12-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX12-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX12-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX12-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX12-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX12-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX12-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX12-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX12-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX12-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX12-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX12-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX12-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX12-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX12-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX12-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX12-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX12-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX12-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX12-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX12-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX12-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX12-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX12-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX12-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s2
+; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s3
+; GFX12-NEXT: ; use s1
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s4
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX12-NEXT: ; use s67
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ; use s2
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_endpgm
%v0 = load i32, ptr addrspace(1) %p
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 8ef2ca2..920b6cc 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -55,14 +55,14 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
@@ -105,14 +105,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -230,14 +230,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sgt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -278,14 +278,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_uge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -326,14 +326,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_ugt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -373,14 +373,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_umax_ugt_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 5c88328..51b6410 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -267,9 +267,10 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
;
; GCN-SCRATCH-LABEL: vector_clause_indirect:
; GCN-SCRATCH: ; %bb.0: ; %bb
+; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
@@ -278,9 +279,9 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
; GCN-SCRATCH-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -416,22 +417,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_clause 0x1
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
+; GCN-SCRATCH-NEXT: s_brev_b32 s0, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s1, s0
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s2
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s3
+; GCN-SCRATCH-NEXT: s_mov_b32 s3, 0
+; GCN-SCRATCH-NEXT: s_mov_b32 s2, s0
+; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 2334543..9586684 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2478,20 +2478,20 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[4:5], 0x0
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_load_dword s2, s[12:13], 0x0
+; VI-NEXT: s_load_dword s3, s[14:15], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lt_u32 s4, s5
+; VI-NEXT: s_cmp_lt_u32 s2, s3
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT: s_cselect_b32 s0, s4, s5
+; VI-NEXT: s_cselect_b32 s0, s2, s3
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_byte v[2:3], v4
@@ -2499,58 +2499,58 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lt_u32 s8, s9
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s4, s8, s9
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_cmp_lt_u32 s2, s3
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lt_u32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_cselect_b32 s4, s8, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
-; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_cmp_lt_u32 s0, s1
+; GFX10-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v1, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[8:9], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[10:11], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lt_u32 s4, s5
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: s_cmp_lt_u32 s0, s1
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s4, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2629,18 +2629,18 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
@@ -2651,50 +2651,50 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_short v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_u16 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
index 99120ab..70082e9 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
@@ -7,16 +7,16 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = add i64 %lhs, 123456789123456789
@@ -30,15 +30,15 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
@@ -53,16 +53,16 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -28744524
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 %lhs, 123456789123456789
@@ -76,16 +76,16 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE]], [[GLOBAL_LOAD_DWORDX2_SADDR]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%rhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 123456789123456789, %rhs
@@ -99,15 +99,15 @@ define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 1cd9afe..db33ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -7,13 +7,13 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.exp2.f32(float %val)
@@ -27,14 +27,14 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.exp2.f16(half %val)
@@ -48,13 +48,13 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.log.f32(float %val)
@@ -68,14 +68,14 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.log.f16(half %val)
@@ -89,13 +89,13 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rcp.f32(float %val)
@@ -109,14 +109,14 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rcp.f16(half %val)
@@ -130,13 +130,13 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rsq.f32(float %val)
@@ -150,14 +150,14 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rsq.f16(half %val)
@@ -171,13 +171,13 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.sqrt.f32(float %val)
@@ -191,14 +191,14 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.sqrt.f16(half %val)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 3c60153..c0e0b50 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -31,99 +31,99 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_mul_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v1, v1, v3
; VI-NEXT: v_mul_lo_u32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_mul_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_mul_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_mul_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_mul_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -179,117 +179,117 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_mul_v4i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v3, v3, v7
; VI-NEXT: v_mul_lo_u32 v2, v2, v6
; VI-NEXT: v_mul_lo_u32 v1, v1, v5
; VI-NEXT: v_mul_lo_u32 v0, v0, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_v4i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_v4i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -524,23 +524,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -549,23 +549,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
-; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -618,14 +618,13 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_sext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_sext_c:
@@ -661,15 +660,15 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_i32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -729,14 +728,13 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_zext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_zext_c:
@@ -772,15 +770,15 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -845,100 +843,101 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_sext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -992,100 +991,101 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_zext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_zext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_zext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_zext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_zext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1138,98 +1138,99 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_mul64_sext_inline_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, 9, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_inline_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_inline_imm:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_inline_imm:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_inline_imm:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1276,15 +1277,15 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
;
; VI-LABEL: s_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i32:
@@ -1319,13 +1320,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mul_i32 s0, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1335,13 +1337,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_i32 s2, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_mul_i32 s0, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1383,94 +1386,94 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1517,16 +1520,16 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
;
; VI-LABEL: s_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x70
-; VI-NEXT: s_load_dword s5, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x70
+; VI-NEXT: s_load_dword s3, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_lo_u16_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i1:
@@ -1562,14 +1565,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1579,14 +1582,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1647,109 +1650,109 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: v_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i1:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null
; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1881,17 +1884,17 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s1, s6, s1
-; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s2, s1
-; GFX11-NEXT: s_mul_i32 s2, s7, s0
-; GFX11-NEXT: s_mul_i32 s0, s6, s0
-; GFX11-NEXT: s_add_i32 s1, s1, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s3
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_mul_i32 s1, s7, s2
+; GFX11-NEXT: s_mul_i32 s2, s6, s2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_mov_b32 s1, s5
@@ -1904,9 +1907,9 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_mov_b32 s7, 0x31016000
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_mov_b32 s6, -1
@@ -2049,20 +2052,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
@@ -2071,7 +2074,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2080,20 +2083,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
@@ -2102,7 +2105,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2181,32 +2184,32 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: s_mul_i32 s6, s2, s3
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_mul_i32 s8, s2, s3
+; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: s_mov_b64 s[4:5], -1
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: s_mov_b64 s[2:3], -1
+; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: .LBB15_3: ; %Flow
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: s_branch .LBB15_6
; VI-NEXT: .LBB15_5:
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: .LBB15_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul32_in_branch:
@@ -2216,102 +2219,102 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s6, s2, s3
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_branch .LBB15_3
; GFX9-NEXT: .LBB15_2:
-; GFX9-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: s_mov_b64 s[2:3], -1
+; GFX9-NEXT: ; implicit-def: $sgpr8
; GFX9-NEXT: .LBB15_3: ; %Flow
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
; GFX9-NEXT: ; %bb.4: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB15_6
; GFX9-NEXT: .LBB15_5:
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: .LBB15_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul32_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s5, s2, s3
+; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_branch .LBB15_3
; GFX10-NEXT: .LBB15_2:
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: ; implicit-def: $sgpr5
+; GFX10-NEXT: s_mov_b32 s8, -1
+; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: .LBB15_3: ; %Flow
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
; GFX10-NEXT: ; %bb.4: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB15_6
; GFX10-NEXT: .LBB15_5:
-; GFX10-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: .LBB15_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul32_in_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s5, s2, s3
+; GFX11-NEXT: s_mul_i32 s2, s2, s3
; GFX11-NEXT: s_branch .LBB15_3
; GFX11-NEXT: .LBB15_2:
-; GFX11-NEXT: s_mov_b32 s4, -1
-; GFX11-NEXT: ; implicit-def: $sgpr5
+; GFX11-NEXT: s_mov_b32 s8, -1
+; GFX11-NEXT: ; implicit-def: $sgpr2
; GFX11-NEXT: .LBB15_3: ; %Flow
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
; GFX11-NEXT: ; %bb.4: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB15_6
; GFX11-NEXT: .LBB15_5:
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: .LBB15_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2319,36 +2322,36 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-LABEL: mul32_in_branch:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_i32 s5, s2, s3
+; GFX12-NEXT: s_mul_i32 s2, s2, s3
; GFX12-NEXT: s_branch .LBB15_3
; GFX12-NEXT: .LBB15_2:
-; GFX12-NEXT: s_mov_b32 s4, -1
-; GFX12-NEXT: ; implicit-def: $sgpr5
+; GFX12-NEXT: s_mov_b32 s8, -1
+; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: .LBB15_3: ; %Flow
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
; GFX12-NEXT: ; %bb.4: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_branch .LBB15_6
; GFX12-NEXT: .LBB15_5:
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: .LBB15_6: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2445,31 +2448,31 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: mul64_in_branch:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB16_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
-; VI-NEXT: s_mul_i32 s4, s4, s7
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_mul_i32 s4, s5, s6
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s8, v0, 0
+; VI-NEXT: s_mul_i32 s2, s8, s11
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_mul_i32 s2, s9, s10
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB16_3
; VI-NEXT: .LBB16_2: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT: .LBB16_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: .LBB16_3: ; %endif
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB16_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2477,135 +2480,136 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: mul64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s7, s4, s7
-; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
-; GFX9-NEXT: s_add_i32 s7, s10, s7
-; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s5, s7, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_mul_i32 s2, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10
+; GFX9-NEXT: s_add_i32 s2, s3, s2
+; GFX9-NEXT: s_mul_i32 s3, s9, s10
+; GFX9-NEXT: s_add_i32 s3, s2, s3
+; GFX9-NEXT: s_mul_i32 s2, s8, s10
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
; GFX9-NEXT: .LBB16_2: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB16_5
; GFX9-NEXT: .LBB16_3:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-NEXT: s_branch .LBB16_2
; GFX9-NEXT: .LBB16_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: .LBB16_5: ; %endif
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s7, s4, s7
-; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT: s_mul_i32 s5, s5, s6
-; GFX10-NEXT: s_add_i32 s7, s8, s7
-; GFX10-NEXT: s_mul_i32 s4, s4, s6
-; GFX10-NEXT: s_add_i32 s5, s7, s5
+; GFX10-NEXT: s_mul_i32 s0, s8, s11
+; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX10-NEXT: s_mul_i32 s2, s9, s10
+; GFX10-NEXT: s_add_i32 s0, s1, s0
+; GFX10-NEXT: s_add_i32 s1, s0, s2
+; GFX10-NEXT: s_mul_i32 s0, s8, s10
; GFX10-NEXT: s_cbranch_execnz .LBB16_4
; GFX10-NEXT: .LBB16_2: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB16_5
; GFX10-NEXT: .LBB16_3:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB16_2
; GFX10-NEXT: .LBB16_4:
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: .LBB16_5: ; %endif
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s7, s4, s7
-; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX11-NEXT: s_mul_i32 s5, s5, s6
-; GFX11-NEXT: s_add_i32 s7, s8, s7
-; GFX11-NEXT: s_mul_i32 s4, s4, s6
-; GFX11-NEXT: s_add_i32 s5, s7, s5
+; GFX11-NEXT: s_mul_i32 s0, s8, s11
+; GFX11-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX11-NEXT: s_mul_i32 s2, s9, s10
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s0, s2
+; GFX11-NEXT: s_mul_i32 s0, s8, s10
; GFX11-NEXT: s_cbranch_execnz .LBB16_4
; GFX11-NEXT: .LBB16_2: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB16_5
; GFX11-NEXT: .LBB16_3:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB16_2
; GFX11-NEXT: .LBB16_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: .LBB16_5: ; %endif
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: mul64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB16_4
; GFX12-NEXT: .LBB16_2: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_branch .LBB16_5
; GFX12-NEXT: .LBB16_3:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB16_2
; GFX12-NEXT: .LBB16_4:
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: .LBB16_5: ; %endif
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,41 +2728,41 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; VI-LABEL: s_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
-; VI-NEXT: s_mul_i32 s7, s8, s7
-; VI-NEXT: v_mov_b32_e32 v6, s8
-; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
-; VI-NEXT: s_mul_i32 s12, s9, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
-; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v0, 0
+; VI-NEXT: s_mul_i32 s0, s12, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; VI-NEXT: s_mul_i32 s2, s13, s10
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v6, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v8, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v7
; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: v_mov_b32_e32 v8, s9
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
-; VI-NEXT: s_mul_i32 s8, s11, s4
-; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v8, v[6:7]
+; VI-NEXT: s_mul_i32 s2, s15, s8
+; VI-NEXT: v_add_u32_e32 v6, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, v5
; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
-; VI-NEXT: s_mul_i32 s8, s10, s5
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
-; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
+; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v8, v[2:3]
+; VI-NEXT: s_mul_i32 s2, s14, s9
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v6
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT: v_mov_b32_e32 v1, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i128:
@@ -2813,53 +2817,53 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
+; GFX10-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s13, s2
+; GFX10-NEXT: s_mov_b32 s1, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s3, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX10-NEXT: s_mul_i32 s0, s8, s7
+; GFX10-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX10-NEXT: s_mul_i32 s14, s10, s5
; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX10-NEXT: s_mul_i32 s12, s9, s6
+; GFX10-NEXT: s_mul_i32 s7, s9, s6
; GFX10-NEXT: s_mul_i32 s11, s11, s4
-; GFX10-NEXT: s_add_i32 s3, s7, s3
-; GFX10-NEXT: s_add_i32 s7, s15, s14
+; GFX10-NEXT: s_add_i32 s0, s3, s0
+; GFX10-NEXT: s_add_i32 s3, s15, s14
; GFX10-NEXT: s_mul_i32 s6, s8, s6
; GFX10-NEXT: s_mul_i32 s10, s10, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s12
-; GFX10-NEXT: s_add_i32 s7, s7, s11
+; GFX10-NEXT: s_add_i32 s0, s0, s7
+; GFX10-NEXT: s_add_i32 s3, s3, s11
; GFX10-NEXT: s_mul_i32 s19, s5, s8
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_add_u32 s6, s10, s6
; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX10-NEXT: s_addc_u32 s7, s7, s3
+; GFX10-NEXT: s_addc_u32 s7, s3, s0
; GFX10-NEXT: s_mul_i32 s17, s4, s9
-; GFX10-NEXT: s_add_u32 s3, s19, s20
+; GFX10-NEXT: s_add_u32 s0, s19, s20
; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX10-NEXT: s_mul_i32 s5, s5, s9
; GFX10-NEXT: s_addc_u32 s9, s18, 0
-; GFX10-NEXT: s_add_u32 s3, s17, s3
+; GFX10-NEXT: s_add_u32 s3, s17, s0
; GFX10-NEXT: s_addc_u32 s10, s16, 0
-; GFX10-NEXT: s_mul_i32 s12, s4, s8
+; GFX10-NEXT: s_mul_i32 s0, s4, s8
; GFX10-NEXT: s_add_u32 s4, s9, s10
; GFX10-NEXT: s_addc_u32 s8, 0, 0
; GFX10-NEXT: s_add_u32 s4, s5, s4
; GFX10-NEXT: s_addc_u32 s5, s21, s8
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-NEXT: s_mov_b32 s15, 0x31016000
+; GFX10-NEXT: s_mov_b32 s14, -1
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_i128:
@@ -2867,50 +2871,50 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s13, s2
+; GFX11-NEXT: s_mov_b32 s1, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX11-NEXT: s_mul_i32 s0, s8, s7
+; GFX11-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX11-NEXT: s_mul_i32 s14, s10, s5
; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX11-NEXT: s_mul_i32 s12, s9, s6
+; GFX11-NEXT: s_mul_i32 s7, s9, s6
; GFX11-NEXT: s_mul_i32 s11, s11, s4
-; GFX11-NEXT: s_add_i32 s3, s7, s3
-; GFX11-NEXT: s_add_i32 s7, s15, s14
+; GFX11-NEXT: s_add_i32 s0, s3, s0
+; GFX11-NEXT: s_add_i32 s3, s15, s14
; GFX11-NEXT: s_mul_i32 s6, s8, s6
; GFX11-NEXT: s_mul_i32 s10, s10, s4
-; GFX11-NEXT: s_add_i32 s3, s3, s12
-; GFX11-NEXT: s_add_i32 s7, s7, s11
+; GFX11-NEXT: s_add_i32 s0, s0, s7
+; GFX11-NEXT: s_add_i32 s3, s3, s11
; GFX11-NEXT: s_mul_i32 s19, s5, s8
; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX11-NEXT: s_add_u32 s6, s10, s6
; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX11-NEXT: s_addc_u32 s7, s7, s3
+; GFX11-NEXT: s_addc_u32 s7, s3, s0
; GFX11-NEXT: s_mul_i32 s17, s4, s9
-; GFX11-NEXT: s_add_u32 s3, s19, s20
+; GFX11-NEXT: s_add_u32 s0, s19, s20
; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX11-NEXT: s_mul_i32 s5, s5, s9
; GFX11-NEXT: s_addc_u32 s9, s18, 0
-; GFX11-NEXT: s_add_u32 s3, s17, s3
+; GFX11-NEXT: s_add_u32 s3, s17, s0
; GFX11-NEXT: s_addc_u32 s10, s16, 0
-; GFX11-NEXT: s_mul_i32 s12, s4, s8
+; GFX11-NEXT: s_mul_i32 s0, s4, s8
; GFX11-NEXT: s_add_u32 s4, s9, s10
; GFX11-NEXT: s_addc_u32 s8, 0, 0
; GFX11-NEXT: s_add_u32 s4, s5, s4
; GFX11-NEXT: s_addc_u32 s5, s21, s8
; GFX11-NEXT: s_add_u32 s4, s4, s6
; GFX11-NEXT: s_addc_u32 s5, s5, s7
-; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2918,44 +2922,44 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-LABEL: s_mul_i128:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
-; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
+; GFX12-NEXT: s_load_b128 s[12:15], s[0:1], 0x4c
; GFX12-NEXT: s_mov_b32 s3, 0
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s15, s3
-; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s1, s3
; GFX12-NEXT: s_mov_b32 s17, s3
; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_mov_b32 s24, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s4
-; GFX12-NEXT: s_mov_b32 s14, s8
-; GFX12-NEXT: s_mov_b32 s12, s9
-; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
+; GFX12-NEXT: s_mov_b32 s2, s8
+; GFX12-NEXT: s_mov_b32 s6, s12
+; GFX12-NEXT: s_mov_b32 s0, s13
+; GFX12-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[20:21], s[0:1], s[2:3]
; GFX12-NEXT: s_mov_b32 s2, s23
-; GFX12-NEXT: s_mov_b32 s16, s5
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
-; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
-; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
-; GFX12-NEXT: s_mov_b32 s2, s11
-; GFX12-NEXT: s_mov_b32 s11, s3
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
-; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
+; GFX12-NEXT: s_mov_b32 s16, s9
+; GFX12-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX12-NEXT: s_add_nc_u64 s[12:13], s[20:21], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX12-NEXT: s_mov_b32 s2, s13
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX12-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[16:17]
; GFX12-NEXT: s_mov_b32 s18, s7
; GFX12-NEXT: s_mov_b32 s23, s3
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX12-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9]
; GFX12-NEXT: s_mov_b32 s25, s6
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3067,15 +3071,15 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; VI-LABEL: v_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
@@ -3107,12 +3111,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3133,18 +3137,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3165,17 +3169,17 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i128:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1]
-; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v15, s[4:5]
+; GFX11-NEXT: global_load_b128 v[4:7], v15, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3201,19 +3205,19 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
-; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3]
+; GFX11-NEXT: global_store_b128 v15, v[8:11], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1]
-; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v13, s[4:5]
+; GFX12-NEXT: global_load_b128 v[4:7], v13, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3240,7 +3244,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
+; GFX12-NEXT: global_store_b128 v13, v[8:11], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 6d7bf00..4770b44 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -291,18 +291,18 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s3, 0x180000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s1, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
-; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
+; VI-NEXT: v_mul_i32_i24_e64 v0, s0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
@@ -485,21 +485,21 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s3, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 8
-; VI-NEXT: s_lshl_b32 s5, s4, 8
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
+; VI-NEXT: s_lshl_b32 s1, s2, 8
+; VI-NEXT: s_lshl_b32 s3, s3, 8
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
+; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s0, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
@@ -594,16 +594,16 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
@@ -702,16 +702,16 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0,
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s6, 0x180000
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s6, 0x180000
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index e6470a5..7c43c0b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -78,16 +78,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i
;
; VI-LABEL: test_umul24_i16_sext:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_sext_i32_i16 s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_sext:
@@ -136,40 +136,40 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
;
; VI-LABEL: test_umul24_i16_vgpr_sext:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -200,16 +200,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b
;
; VI-LABEL: test_umul24_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16:
@@ -258,38 +258,38 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_umul24_i16_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -331,13 +331,13 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: test_umul24_i8_vgpr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
@@ -596,14 +596,14 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
-; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s2, s2
+; VI-NEXT: v_mul_u32_u24_e64 v0, s2, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64_square:
@@ -703,17 +703,17 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1
-; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mul_u32_u24_e32 v0, s3, v1
+; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s3, v1
; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
@@ -761,16 +761,16 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 9ab3ecc..28f6c13 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -2104,10 +2104,10 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2115,10 +2115,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 1
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 1
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2128,9 +2128,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2138,9 +2138,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2154,10 +2154,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2165,10 +2165,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2178,9 +2178,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2188,9 +2188,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2204,10 +2204,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2215,10 +2215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2228,9 +2228,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2238,9 +2238,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2254,10 +2254,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2267,10 +2267,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2280,11 +2280,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2292,9 +2292,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2302,10 +2302,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2334,10 +2334,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2347,10 +2347,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2360,11 +2360,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2372,9 +2372,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2382,10 +2382,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2395,10 +2395,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2414,10 +2414,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2427,10 +2427,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2440,11 +2440,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2452,9 +2452,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2462,10 +2462,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2475,10 +2475,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2494,10 +2494,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2507,10 +2507,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2520,11 +2520,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2532,9 +2532,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2555,10 +2555,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2574,10 +2574,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2585,10 +2585,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2598,9 +2598,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2608,9 +2608,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2624,10 +2624,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2637,10 +2637,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2662,9 +2662,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2672,10 +2672,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2685,10 +2685,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2704,10 +2704,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2717,10 +2717,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2730,11 +2730,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2742,9 +2742,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2752,10 +2752,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2765,10 +2765,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2784,10 +2784,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2797,10 +2797,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2810,11 +2810,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2822,9 +2822,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2832,10 +2832,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2845,10 +2845,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2864,10 +2864,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2877,10 +2877,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2890,11 +2890,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2902,9 +2902,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2912,10 +2912,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2925,10 +2925,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2944,10 +2944,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2957,10 +2957,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2970,11 +2970,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2982,9 +2982,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2992,10 +2992,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3005,10 +3005,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3025,10 +3025,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3037,10 +3037,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3050,11 +3050,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3062,11 +3062,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3074,10 +3074,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3087,10 +3087,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3100,10 +3100,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3120,10 +3120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3132,10 +3132,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3145,11 +3145,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3157,11 +3157,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3169,10 +3169,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3182,10 +3182,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3195,10 +3195,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3215,10 +3215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3227,10 +3227,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3240,11 +3240,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3252,11 +3252,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3264,10 +3264,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3277,10 +3277,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3290,10 +3290,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3310,10 +3310,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3323,10 +3323,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3336,11 +3336,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3348,11 +3348,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3360,10 +3360,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3373,10 +3373,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3386,10 +3386,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3406,10 +3406,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -3419,10 +3419,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3432,11 +3432,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3444,11 +3444,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3456,10 +3456,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3469,10 +3469,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3482,10 +3482,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3502,10 +3502,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3515,10 +3515,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3528,11 +3528,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3540,11 +3540,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3552,10 +3552,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3565,10 +3565,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3578,10 +3578,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3612,10 +3612,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3625,10 +3625,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3638,10 +3638,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
@@ -3651,10 +3651,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3664,10 +3664,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3677,10 +3677,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3697,11 +3697,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3711,10 +3711,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3724,10 +3724,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3737,10 +3737,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
@@ -3750,10 +3750,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3763,10 +3763,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3776,10 +3776,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3810,10 +3810,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3823,10 +3823,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3836,10 +3836,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
@@ -3849,10 +3849,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3862,10 +3862,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3875,10 +3875,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3895,11 +3895,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3909,10 +3909,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3922,10 +3922,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3935,10 +3935,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
@@ -3948,10 +3948,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3961,10 +3961,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3974,10 +3974,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3994,11 +3994,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4008,10 +4008,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4021,10 +4021,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4034,10 +4034,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
@@ -4047,10 +4047,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4060,10 +4060,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4073,10 +4073,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -4093,11 +4093,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4107,10 +4107,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4120,10 +4120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4133,10 +4133,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
@@ -4146,10 +4146,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4159,10 +4159,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4172,10 +4172,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 10381bc..8dcca32 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -2176,30 +2176,30 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2208,10 +2208,10 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
;
; GFX12-LABEL: global_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2226,30 +2226,30 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2276,30 +2276,30 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2308,10 +2308,10 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2326,30 +2326,30 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2358,10 +2358,10 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2376,30 +2376,30 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2408,10 +2408,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2426,20 +2426,20 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2449,10 +2449,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2461,10 +2461,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2473,10 +2473,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2490,11 +2490,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2502,10 +2502,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2515,10 +2515,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2530,10 +2530,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2573,30 +2573,30 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2605,10 +2605,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2623,30 +2623,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2655,10 +2655,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2673,30 +2673,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2705,10 +2705,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2723,20 +2723,20 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2746,10 +2746,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2758,10 +2758,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2770,10 +2770,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2787,11 +2787,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2799,10 +2799,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2812,10 +2812,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2827,10 +2827,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2839,10 +2839,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2850,11 +2850,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2870,11 +2870,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2882,10 +2882,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2895,10 +2895,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2910,10 +2910,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2922,10 +2922,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2933,11 +2933,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2954,11 +2954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2966,10 +2966,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2979,10 +2979,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2994,10 +2994,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3009,10 +3009,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3020,11 +3020,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3034,11 +3034,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3055,11 +3055,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3067,10 +3067,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3080,10 +3080,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3095,10 +3095,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3110,10 +3110,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3121,11 +3121,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3135,11 +3135,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3156,11 +3156,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3168,10 +3168,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3181,10 +3181,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3196,10 +3196,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3211,10 +3211,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3222,11 +3222,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3236,11 +3236,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3257,11 +3257,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3269,10 +3269,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3282,10 +3282,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3297,10 +3297,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3312,10 +3312,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3323,11 +3323,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3337,11 +3337,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3358,11 +3358,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3383,10 +3383,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3398,10 +3398,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3413,10 +3413,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3424,11 +3424,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3438,11 +3438,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3459,11 +3459,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3471,10 +3471,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3484,10 +3484,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3499,10 +3499,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3514,10 +3514,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3525,11 +3525,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3539,11 +3539,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3560,11 +3560,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3572,11 +3572,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3584,11 +3584,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3612,12 +3612,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x7ff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3634,11 +3634,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3646,11 +3646,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3658,11 +3658,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3672,11 +3672,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3686,12 +3686,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x800
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3708,11 +3708,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3720,11 +3720,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3732,11 +3732,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3746,11 +3746,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3760,12 +3760,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3782,11 +3782,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3794,11 +3794,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3806,11 +3806,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3820,11 +3820,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3834,12 +3834,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3856,11 +3856,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3868,11 +3868,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3880,11 +3880,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3894,11 +3894,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3908,12 +3908,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1fff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3930,11 +3930,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3942,11 +3942,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3954,11 +3954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3968,11 +3968,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3982,12 +3982,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x2000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 769d035..48259163 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -43,30 +43,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,15 +101,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -119,30 +119,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -177,15 +177,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -195,30 +195,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -253,15 +253,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -271,30 +271,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index bd7f901..5b755d0a 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)
define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) {
; GCN-LABEL: if_masked_0x8000000000000000:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s5, 0x80000000
; GCN-NEXT: s_cmp_eq_u64 s[0:1], 0
; GCN-NEXT: s_cselect_b32 s0, 22, 33
; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: global_store_dword v0, v1, s[6:7]
; GCN-NEXT: s_endpgm
%and = and i64 %arg, 9223372036854775808
%cmp = icmp eq i64 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 65f4a1b..63e9e60 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -25,21 +25,21 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v2i32:
@@ -92,24 +92,24 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX8-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v4i32:
@@ -258,14 +258,14 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
;
; GFX8-LABEL: scalar_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 0x1869f
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 0x1869f
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i32:
@@ -300,16 +300,16 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
;
; GFX8-LABEL: scalar_or_literal_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s5, s5, 0xf237b
-; GFX8-NEXT: s_or_b32 s4, s4, 0x3039
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, 0xf237b
+; GFX8-NEXT: s_or_b32 s1, s2, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i64:
@@ -357,18 +357,18 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
-; GFX8-NEXT: s_movk_i32 s8, 0x3039
-; GFX8-NEXT: s_mov_b32 s9, 0xf237b
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x74
+; GFX8-NEXT: s_movk_i32 s0, 0x3039
+; GFX8-NEXT: s_mov_b32 s1, 0xf237b
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s8, 0x3039
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b
+; GFX8-NEXT: s_addc_u32 s1, s9, 0xf237b
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -421,15 +421,15 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
;
; GFX8-LABEL: scalar_or_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 63
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 63
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_inline_imm_i64:
@@ -534,15 +534,15 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
;
; GFX8-LABEL: scalar_or_neg_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, -8
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, -8
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_neg_inline_imm_i64:
@@ -583,20 +583,20 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_literal_i32:
@@ -642,20 +642,20 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
;
; GFX8-LABEL: vector_or_inline_immediate_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_inline_immediate_i32:
@@ -886,21 +886,21 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_i64_loadimm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, 0x146f, v1
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_loadimm:
@@ -949,20 +949,20 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: vector_or_i64_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_imm:
@@ -1009,21 +1009,21 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: vector_or_i64_neg_inline_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_inline_imm:
@@ -1072,21 +1072,21 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: vector_or_i64_neg_literal:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_literal:
@@ -1129,15 +1129,15 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
;
; GFX8-LABEL: trunc_i64_or_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dword s3, s[0:1], 0x74
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s5, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: trunc_i64_or_to_i32:
@@ -1261,17 +1261,17 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-LABEL: s_or_i1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_eq_u32 s4, s5
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8-NEXT: s_cmp_eq_u32 s6, s7
-; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8-NEXT: buffer_store_byte v0, off, s[8:11], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: s_or_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 1899a0ab..4048994 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out,
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0xffff
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
@@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index 4794c29..8333386 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
; GCN-LABEL: dbg_clause:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v1, v0, s[2:3]
+; GCN-NEXT: global_load_dword v1, v0, s[6:7]
; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1
-; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32
+; GCN-NEXT: global_load_dword v2, v0, s[6:7] offset:32
; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, v1, v2
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 3f8b64b..0747760 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -519,11 +519,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
@@ -534,11 +534,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index dabb9d4..0c8dbd1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3i32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
store <3 x i32> %vec, ptr addrspace(1) undef, align 8
ret void
@@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3f32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x float>, ptr addrspace(1) %arg, align 8
store <3 x float> %vec, ptr addrspace(1) undef, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 2ce0b9e..a82f301 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) {
; SDAG-LABEL: buffers_might_alias:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_might_alias:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; GISEL-NEXT: s_endpgm
%l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
%s0 = fmul float %l0, %l0
@@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac
define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) {
; SDAG-LABEL: independent_offsets:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SDAG-NEXT: v_mov_b32_e32 v2, 1.0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; SDAG-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; SDAG-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: independent_offsets:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GISEL-NEXT: v_mov_b32_e32 v2, 1.0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GISEL-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GISEL-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; GISEL-NEXT: s_endpgm
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%idx = shl i32 %lane, 2
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 74bad5ea..5be6082 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -759,12 +759,12 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src)
;
; VI-LABEL: s_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -811,12 +811,12 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float
;
; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -863,12 +863,12 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -915,12 +915,12 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -967,12 +967,12 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1)
;
; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1019,12 +1019,12 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %
;
; VI-LABEL: s_rcp_fabs_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, |s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, |s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s
;
; VI-LABEL: s_neg_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1127,12 +1127,12 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl
;
; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1
;
; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2|
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4|
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
@@ -1254,12 +1254,12 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1309,13 +1309,13 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1366,13 +1366,13 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out)
;
; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 4a00473..9494b3c 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -112,16 +112,16 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotl_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: s_sub_i32 s3, 32, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: s_sub_i32 s1, 32, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -143,14 +143,14 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s7
-; GFX11-NEXT: s_sub_i32 s3, 32, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s2
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_i32 s0, 32, s7
+; GFX11-NEXT: s_sub_i32 s1, 32, s6
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s0
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -211,22 +211,22 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotl_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s3, 32, s9
+; GFX8-NEXT: s_sub_i32 s1, 32, s9
; GFX8-NEXT: s_sub_i32 s9, 32, s11
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
; GFX8-NEXT: s_sub_i32 s8, 32, s10
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_alignbit_b32 v3, s7, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -252,18 +252,18 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_sub_i32 s8, 32, s11
; GFX11-NEXT: s_sub_i32 s9, 32, s10
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s8
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s9
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s2
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s0
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6431d7..f9da328 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -99,14 +99,14 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotr_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -126,12 +126,12 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s6
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,7 +180,7 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotr_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v1, s10
@@ -189,9 +189,9 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v4
; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -213,14 +213,14 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s10
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index bd3c422..acacf76 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -39,18 +39,18 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_add_u32 s2, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s3, s7, s1
-; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: s_addc_u32 s1, s7, s3
+; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -99,19 +99,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s6, s0
-; GFX11-NEXT: s_addc_u32 s3, s7, s1
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_xor_b32 s2, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -155,17 +155,17 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_i32 s4, s0, s1
-; VI-NEXT: s_cmp_lt_i32 s1, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: s_cmp_lt_i32 s4, s0
+; VI-NEXT: s_add_i32 s4, s2, s3
+; VI-NEXT: s_cmp_lt_i32 s3, 0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s4, s2
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dword v[0:1], v4
@@ -208,18 +208,18 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-LABEL: s_saddo_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: v_add_nc_i32 v0, s2, s3 clamp
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -264,18 +264,18 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
@@ -288,45 +288,45 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_load_dword v2, v0, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT: global_store_byte v0, v2, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_b32 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -334,8 +334,8 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -379,21 +379,21 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -401,56 +401,56 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s8, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s9, s5, s7
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v2, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s8, s4, s6
-; GFX10-NEXT: s_addc_u32 s9, s5, s7
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_xor_b32 s4, s6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: global_store_byte v2, v3, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_saddo_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s8, s4, s6
-; GFX11-NEXT: s_addc_u32 s9, s5, s7
-; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s4, s6, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_xor_b32 s0, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -496,18 +496,18 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
@@ -627,18 +627,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
@@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
@@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -691,18 +691,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
-; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[4:5]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
-; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX11-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -714,8 +714,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[4:5]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 5260a48..ae1b191 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -26,22 +26,22 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load i32, ptr addrspace(1) %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
@@ -73,22 +73,22 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load float, ptr addrspace(1) %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
@@ -230,13 +230,13 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero
;
; VI-LABEL: scalar_to_vector_test6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
%bc = bitcast <4 x i8> %newvec0 to <2 x half>
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 6372d74..ef8e194 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -220,44 +220,44 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_i32_4:
@@ -316,48 +316,48 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
;
; TONGA-LABEL: slow_sdiv_i32_3435:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2
+; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
+; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: slow_sdiv_i32_3435:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: slow_sdiv_i32_3435:
@@ -462,17 +462,17 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_mov_b32 s2, s6
+; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s10
+; TONGA-NEXT: s_mov_b32 s1, s11
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, s8
+; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
@@ -707,17 +707,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; TONGA-LABEL: sdiv_v2i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -727,22 +727,22 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v2i32_4:
@@ -918,18 +918,18 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s11, 0xf000
; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s2, s10
+; TONGA-NEXT: s_mov_b32 s3, s11
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s8, s4
+; TONGA-NEXT: s_mov_b32 s9, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: sdiv_v4i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v4i32_4:
@@ -1482,18 +1482,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; TONGA-LABEL: v_sdiv_i8:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1510,23 +1510,23 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i8:
@@ -2221,21 +2221,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
-; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
-; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
-; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
+; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
+; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
+; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
+; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2248,26 +2248,26 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
-; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
-; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
-; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
+; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
+; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
+; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
+; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0f2eedb..b271a03 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -7,13 +7,13 @@
define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: add_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2
@@ -22,13 +22,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: add_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: add_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -65,13 +65,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: sub_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2
@@ -80,13 +80,13 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: sub_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: sub_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -124,14 +124,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; NOSDWA-LABEL: mul_shr_i32:
; NOSDWA: ; %bb.0:
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -148,14 +148,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX89-LABEL: mul_shr_i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -211,14 +211,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -232,14 +232,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -294,14 +294,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -320,14 +320,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -384,14 +384,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v4i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -416,14 +416,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v4i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -485,14 +485,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v8i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -529,14 +529,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v8i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -608,12 +608,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -626,12 +626,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -680,11 +680,11 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -704,12 +704,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -760,12 +760,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v4half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
@@ -790,12 +790,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v4half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v4, s4
@@ -851,12 +851,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v8half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v4, s6
; NOSDWA-NEXT: v_mov_b32_e32 v5, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; NOSDWA-NEXT: v_mov_b32_e32 v8, s4
@@ -893,12 +893,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v8half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v4, s0
-; GFX89-NEXT: v_mov_b32_e32 v5, s1
+; GFX89-NEXT: v_mov_b32_e32 v4, s2
+; GFX89-NEXT: v_mov_b32_e32 v5, s3
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX89-NEXT: v_mov_b32_e32 v8, s4
@@ -964,13 +964,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v4, s1
-; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v4, s3
+; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2]
; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4]
@@ -984,13 +984,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v2, s7
; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX89-NEXT: v_mov_b32_e32 v4, s1
-; GFX89-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; GFX89-NEXT: v_mov_b32_e32 v4, s3
+; GFX89-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; GFX89-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX89-NEXT: flat_load_ubyte v2, v[1:2]
; GFX89-NEXT: flat_load_ubyte v3, v[3:4]
@@ -1043,14 +1043,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v2i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -1071,14 +1071,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v2i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -1143,15 +1143,15 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v4i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1183,14 +1183,14 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v4i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1272,14 +1272,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v8i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1331,14 +1331,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v8i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1449,13 +1449,13 @@ entry:
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
; NOSDWA-LABEL: sitofp_v2i16_to_v2f16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_cvt_f16_i16_e32 v3, v3
@@ -1467,13 +1467,13 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX89-LABEL: sitofp_v2i16_to_v2f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f16_i16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f16_i16_e32 v2, v2
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX9-LABEL: sitofp_v2i16_to_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sitofp_v2i16_to_v2f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
@@ -1520,11 +1520,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mac_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1544,11 +1544,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mac_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: flat_load_dword v3, v[0:1]
@@ -1605,15 +1605,15 @@ entry:
define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: immediate_mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, 0x7b, v2
; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1625,16 +1625,16 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX89-LABEL: immediate_mul_v2i16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX89-NEXT: v_mov_b32_e32 v3, 0x141
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
+; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_mul_lo_u16_e32 v4, 0x7b, v2
; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: immediate_mul_v2i16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x141007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
-; GFX9-NEXT: s_mov_b32 s2, 0x141007b
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: immediate_mul_v2i16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1680,14 +1680,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: mulmul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1709,14 +1709,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: mulmul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1778,12 +1778,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: add_bb_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v1, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1803,12 +1803,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: add_bb_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v1, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -1863,13 +1863,13 @@ store_label:
define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 {
; NOSDWA-LABEL: pulled_out_test:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s7
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0
; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0
@@ -1900,15 +1900,15 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX89-LABEL: pulled_out_test:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_mov_b32_e32 v4, 8
; GFX89-NEXT: v_mov_b32_e32 v5, 0xff
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v2, s2
-; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_mov_b32_e32 v3, s7
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0
@@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX9-LABEL: pulled_out_test:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
@@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: pulled_out_test:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 8
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%idxprom = ashr exact i64 15, 32
@@ -2207,11 +2207,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; NOSDWA-LABEL: mac_v2half_same_srcop:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -2231,12 +2231,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; GFX89-LABEL: mac_v2half_same_srcop:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 0992e9e3..53b78bd 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -85,17 +85,17 @@ define amdgpu_kernel void @select_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s26, s14
-; GFX11-NEXT: s_mov_b32 s27, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s26, s2
+; GFX11-NEXT: s_mov_b32 s27, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s6
; GFX11-NEXT: s_mov_b32 s17, s7
@@ -109,13 +109,13 @@ define amdgpu_kernel void @select_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v3, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -174,64 +174,64 @@ define amdgpu_kernel void @select_f16_imm_a(
;
; VI-LABEL: select_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -287,64 +287,64 @@ define amdgpu_kernel void @select_f16_imm_b(
;
; VI-LABEL: select_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,65 +400,65 @@ define amdgpu_kernel void @select_f16_imm_c(
;
; VI-LABEL: select_f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -514,65 +514,65 @@ define amdgpu_kernel void @select_f16_imm_d(
;
; VI-LABEL: select_f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,28 +801,28 @@ define amdgpu_kernel void @select_v2f16_imm_a(
;
; VI-LABEL: select_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
@@ -830,36 +830,36 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
@@ -874,7 +874,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -942,28 +942,28 @@ define amdgpu_kernel void @select_v2f16_imm_b(
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
@@ -971,36 +971,36 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
@@ -1015,7 +1015,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1083,29 +1083,29 @@ define amdgpu_kernel void @select_v2f16_imm_c(
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1118,32 +1118,32 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1158,7 +1158,7 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1226,29 +1226,29 @@ define amdgpu_kernel void @select_v2f16_imm_d(
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1261,32 +1261,32 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1301,7 +1301,7 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b3f4790..232c05e 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -29,17 +29,17 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s5, s5, s7
-; VI-NEXT: s_lshl_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s1, s1, s3
+; VI-NEXT: s_lshl_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i32:
@@ -159,21 +159,21 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16:
@@ -396,29 +396,29 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: shl_i16_computed_amount:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_add_u16_e32 v0, 3, v0
; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
@@ -484,14 +484,14 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) {
;
; VI-LABEL: shl_i16_i_s:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 12
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s0, s2, 12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_i_s:
@@ -561,26 +561,26 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4
+; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i16:
@@ -659,15 +659,15 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -770,16 +770,16 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i64:
@@ -1041,14 +1041,14 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a
;
; VI-LABEL: s_shl_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_32_i64:
@@ -1153,18 +1153,18 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: s_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s9, 0xffff
-; VI-NEXT: s_mov_b32 s8, s6
-; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_constant_i64:
@@ -1215,20 +1215,20 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: v_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, 0xab19b207
-; VI-NEXT: s_movk_i32 s1, 0x11e
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, 0xab19b207
+; VI-NEXT: s_movk_i32 s5, 0x11e
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_constant_i64:
@@ -1285,16 +1285,16 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr
;
; VI-LABEL: v_shl_i64_32_bit_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x12d687, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x12d687, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_i64_32_bit_constant:
@@ -1349,16 +1349,16 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_inline_imm_64_i64:
@@ -1407,15 +1407,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: s_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_64_i64:
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: s_shl_inline_imm_1_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_i64:
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_0_i64:
@@ -1555,15 +1555,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
@@ -1602,15 +1602,15 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
@@ -1649,15 +1649,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
@@ -1696,15 +1696,15 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
@@ -1743,15 +1743,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
@@ -1790,15 +1790,15 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
@@ -1837,15 +1837,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
@@ -1887,15 +1887,15 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x40800000, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1384fb0..05948d8 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -90,26 +90,26 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
@@ -142,24 +142,24 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -364,27 +364,27 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
@@ -416,24 +416,24 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,26 +450,26 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
@@ -498,24 +498,24 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -532,27 +532,27 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -595,26 +595,26 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -633,27 +633,27 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
@@ -692,26 +692,26 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf6..5af7dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -48,15 +48,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -65,16 +65,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -84,35 +84,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -173,18 +173,18 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-SDAG-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
@@ -196,19 +196,19 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -221,52 +221,52 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -319,15 +319,15 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_64_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
@@ -336,16 +336,16 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_64_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -355,35 +355,35 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_64_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_64_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_64_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -432,15 +432,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_65:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
@@ -449,16 +449,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_65:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -468,70 +468,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -580,15 +580,15 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_65_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
@@ -597,16 +597,16 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_65_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -616,35 +616,35 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_65_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_65_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_65_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -693,15 +693,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 16, v3
@@ -710,16 +710,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -729,70 +729,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -841,15 +841,15 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg16_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
@@ -858,16 +858,16 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg16_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -877,35 +877,35 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg16_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg16_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg16_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -954,15 +954,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg17:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 17, v3
@@ -971,16 +971,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg17:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -990,70 +990,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1102,15 +1102,15 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg17_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
@@ -1119,16 +1119,16 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg17_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1138,35 +1138,35 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg17_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg17_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg17_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1270,15 +1270,15 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1287,16 +1287,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1306,35 +1306,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i16_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1387,16 +1387,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-SDAG-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-SDAG-NEXT: flat_load_ushort v2, v[1:2]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v2
@@ -1405,17 +1405,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s7
; VI-GISEL-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-GISEL-NEXT: flat_load_ushort v2, v[1:2]
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1425,41 +1425,41 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1521,18 +1521,18 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-SDAG-NEXT: v_subrev_u16_e32 v3, 64, v4
@@ -1544,19 +1544,19 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1569,52 +1569,52 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1676,16 +1676,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1696,17 +1696,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1718,35 +1718,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1804,16 +1804,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -7, v3
@@ -1824,17 +1824,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1846,48 +1846,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x400007
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x400007
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1945,16 +1945,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1965,17 +1965,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1987,48 +1987,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x7b0040
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x7b0040
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2084,15 +2084,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2103,17 +2103,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2125,35 +2125,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,16 +2208,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2227,18 +2227,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2248,35 +2248,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2330,16 +2330,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2349,18 +2349,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffc400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2370,48 +2370,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 35
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 35
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2465,16 +2465,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xffffbc00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2484,18 +2484,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2505,48 +2505,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 34
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 34
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2605,16 +2605,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2625,17 +2625,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffffe0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2647,35 +2647,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2729,16 +2729,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2748,18 +2748,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2769,35 +2769,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,15 +2852,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2871,17 +2871,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2893,35 +2893,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2980,16 +2980,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -16, v3
@@ -3000,17 +3000,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3022,35 +3022,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3104,16 +3104,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3123,18 +3123,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3144,35 +3144,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3227,15 +3227,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -3246,17 +3246,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -3268,35 +3268,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3354,16 +3354,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc400, v3
@@ -3374,17 +3374,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3396,72 +3396,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0xc400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3519,16 +3519,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4400, v3
@@ -3539,17 +3539,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3561,72 +3561,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x4400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3684,16 +3684,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4000, v3
@@ -3704,17 +3704,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3726,35 +3726,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3812,16 +3812,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc000, v3
@@ -3832,17 +3832,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3854,35 +3854,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3935,15 +3935,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3953,19 +3953,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3975,35 +3975,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4054,15 +4054,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 32, v3
@@ -4071,19 +4071,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: s_lshl_b32 s0, s0, 16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3
@@ -4093,71 +4093,71 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 1ab6376..3dcdfeb 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -45,12 +45,12 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out,
; FLAT-NEXT: s_cbranch_execnz .LBB0_1
; FLAT-NEXT: ; %bb.2: ; %ENDLOOP
; FLAT-NEXT: s_or_b64 exec, exec, s[2:3]
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_mov_b32_e32 v0, 0
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
main_body:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 33249e4..1aa3da9 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -130,15 +130,15 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s0, s2, 31
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i32 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -166,20 +166,20 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%sext = sext i32 %val to i64
@@ -203,15 +203,15 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i16 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -276,17 +276,17 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s4, s5
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_cmp_eq_u32 s6, s7
-; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
@@ -375,26 +375,26 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; VI-NEXT: s_ashr_i32 s5, s4, 24
-; VI-NEXT: s_bfe_i32 s6, s4, 0x80010
-; VI-NEXT: s_sext_i32_i8 s4, s4
+; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2
+; VI-NEXT: s_ashr_i32 s0, s2, 24
+; VI-NEXT: s_bfe_i32 s1, s2, 0x80010
+; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i32 %a to <4 x i8>
@@ -443,30 +443,30 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in
@@ -513,27 +513,27 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a)
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_ashr_i32 s1, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_ashr_i32 s0, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_ashr_i32 s5, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s4, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i64 %a to <4 x i16>
@@ -580,29 +580,29 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 539cfc7..4770a35 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -42,25 +42,25 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;
; GFX9-LABEL: test_simple_indirect_call:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX9-NEXT: s_add_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
+; GFX9-NEXT: s_lshr_b32 s4, s6, 16
+; GFX9-NEXT: s_mul_i32 s4, s4, s7
; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
-; GFX9-NEXT: s_getpc_b64 s[6:7]
-; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, indirect@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, indirect@rel32@hi+12
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: v_mad_u32_u24 v0, v1, s7, v0
; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
%fptr = alloca ptr, addrspace(5)
%fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index b037268..ba0f4c8 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -32,50 +32,50 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,12 +116,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -136,8 +136,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -146,11 +146,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -170,7 +170,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -209,47 +209,47 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -289,12 +289,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -305,11 +305,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -318,11 +318,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -341,7 +341,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,34 +392,34 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s9, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_xor_b32 s2, s4, s5
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s9, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_xor_b32 s0, s4, s5
; GFX8-NEXT: s_flbit_i32 s8, s5
-; GFX8-NEXT: s_ashr_i32 s2, s2, 31
+; GFX8-NEXT: s_ashr_i32 s0, s0, 31
; GFX8-NEXT: s_add_i32 s8, s8, -1
-; GFX8-NEXT: s_add_i32 s2, s2, 32
-; GFX8-NEXT: s_min_u32 s6, s8, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_i32 s0, s0, 32
+; GFX8-NEXT: s_min_u32 s6, s8, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s6
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -427,35 +427,35 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,19 +534,19 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
@@ -603,12 +603,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -664,7 +664,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -719,37 +719,37 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s8, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: s_xor_b32 s3, s4, s5
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_flbit_i32 s2, s5
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s7, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s8, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_xor_b32 s1, s4, s5
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_flbit_i32 s0, s5
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s7, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s7
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s7
+; GFX8-NEXT: s_sub_i32 s0, 32, s7
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -757,40 +757,40 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -877,18 +877,18 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
@@ -943,7 +943,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -952,12 +952,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index b4b0d96..8384576 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
;
; VI-LABEL: sitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
;
; VI-LABEL: sitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; VI-LABEL: sitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
;
; VI-LABEL: sitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v1, v1
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e1bd152..e585e6c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -4988,1318 +4988,1317 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-LABEL: test:
; GFX9-FLATSCR: ; %bb.0: ; %entry
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x80
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s6, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s7
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 20
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 20
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 36
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 36
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 52
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 52
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x44
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x54
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x74
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x100
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x100
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x94
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x180
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x180
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x114
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x124
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x134
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x154
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x164
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x200
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x200
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x194
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x280
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x280
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x214
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x234
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x244
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x254
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x300
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x300
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x294
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x380
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x380
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x314
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x324
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x334
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x354
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x364
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x400
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x400
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x394
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x404
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s5
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x414
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x424
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x434
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x444
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x454
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x464
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x474
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x484
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x494
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x504
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x514
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x524
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x534
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x544
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x554
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x564
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x574
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x584
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x594
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x604
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x614
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x624
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x634
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x644
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x654
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x664
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x674
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x684
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x694
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x704
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x714
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x724
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x734
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x744
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x754
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x764
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x774
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x784
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x794
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2080
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2096
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4080
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
@@ -6314,1035 +6313,1036 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s0, v5
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s4, v5
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2096
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2096
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xba4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xae4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xad4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xac4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xab4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x994
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x984
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x974
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x964
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x954
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x944
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x934
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x924
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x914
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x904
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x894
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x884
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x874
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x864
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x854
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x844
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x834
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x824
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x814
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x804
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x794
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x784
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x774
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x764
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x754
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x744
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x734
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x724
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x714
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x704
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x694
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x684
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x674
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x664
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x654
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x644
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x634
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x624
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x614
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x604
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x594
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x584
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x574
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x564
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x554
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x544
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x534
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x524
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x514
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x504
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x494
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x484
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x474
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x464
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x454
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x444
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x434
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x424
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x414
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x404
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-FLATSCR-NEXT: s_nop 0
; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v4
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:4080
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
@@ -7617,14 +7617,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s4, s2, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s3, 0, s4
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s6, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s7, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x80, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v4
@@ -8045,795 +8045,795 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:2036 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v6
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v7, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v8
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v10
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v11, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v12
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v13, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v14
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v16
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v18
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v19, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s2, s0, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s2
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s4, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s5, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
@@ -8847,520 +8847,520 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
-; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[2:3], v[7:10], off offset:2032
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index b8cf692..64277e8 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -27,21 +27,21 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i32:
@@ -94,24 +94,24 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v4, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i32:
@@ -175,31 +175,31 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v1
-; VI-NEXT: v_readfirstlane_b32 s1, v0
-; VI-NEXT: s_ashr_i32 s2, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s3, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s1, s2, s3
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v0
+; VI-NEXT: s_ashr_i32 s6, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s7, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s5, s4
+; VI-NEXT: s_ashr_i32 s5, s6, s7
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i16:
@@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: v_readfirstlane_b32 s1, v3
-; VI-NEXT: v_readfirstlane_b32 s2, v0
-; VI-NEXT: v_readfirstlane_b32 s3, v1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_ashr_i32 s10, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s11, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s2, s0
-; VI-NEXT: s_ashr_i32 s2, s9, s11
-; VI-NEXT: s_ashr_i32 s1, s3, s1
-; VI-NEXT: s_ashr_i32 s3, s8, s10
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v2
+; VI-NEXT: v_readfirstlane_b32 s5, v3
+; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: s_ashr_i32 s8, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: s_ashr_i32 s9, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_ashr_i32 s10, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s11, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s6, s4
+; VI-NEXT: s_ashr_i32 s6, s9, s11
+; VI-NEXT: s_ashr_i32 s5, s7, s5
+; VI-NEXT: s_ashr_i32 s7, s8, s10
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i16:
@@ -409,16 +409,16 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_ashr_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s3, s2, 31
+; VI-NEXT: s_ashr_i64 s[0:1], s[2:3], 8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_i64:
@@ -461,20 +461,20 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_i64_2:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_i64_2:
@@ -533,22 +533,22 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i64:
@@ -730,18 +730,18 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s7, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s8, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
@@ -785,17 +785,17 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -849,18 +849,18 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s6, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s1, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
@@ -905,17 +905,17 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 418c160..d33723c 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i32:
@@ -83,17 +83,17 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: lshr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s5, s7
-; VI-NEXT: s_lshr_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s1, s1, s3
+; VI-NEXT: s_lshr_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v2i32:
@@ -212,16 +212,16 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i64:
@@ -382,14 +382,14 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_lshr_32_i64:
@@ -428,17 +428,17 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 45aa544..6175d49 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
;
; GFX8-LABEL: s_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 0x4d2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -130,13 +130,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: flat_store_dword v[2:3], v0
@@ -144,24 +144,24 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: test_sub_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -194,13 +194,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: test_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -208,24 +208,24 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX9-LABEL: test_sub_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -257,13 +257,13 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
@@ -272,26 +272,26 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -328,18 +328,18 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s2, 16
-; GFX8-NEXT: s_addc_u32 s3, s3, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_add_u32 s0, s6, 16
+; GFX8-NEXT: s_addc_u32 s1, s7, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
@@ -350,33 +350,33 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2
; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0
-; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v8, s[6:7] offset:16
+; GFX12-NEXT: global_load_b128 v[4:7], v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -412,11 +412,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -424,38 +424,38 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sub_u16_e32 v2, v4, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_sub_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v0, v0, s[6:7] offset:2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0
-; GFX12-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -497,15 +497,15 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v4, v0, v1
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -515,26 +515,26 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -583,15 +583,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v6, v1, v3
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -604,28 +604,28 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -657,14 +657,14 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX8-LABEL: s_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_u32 s2, s4, s6
-; GFX8-NEXT: s_subb_u32 s3, s5, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_sub_u32 s0, s4, s6
+; GFX8-NEXT: s_subb_u32 s1, s5, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -685,12 +685,12 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[4:5], s[6:7]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -726,14 +726,14 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: v_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -764,13 +764,13 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX12-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -859,13 +859,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v4, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
@@ -922,14 +922,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v4i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
@@ -988,15 +988,15 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v12, s[2:3]
; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16
-; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[12:15], v12, s[2:3] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x2
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6c53afe..c1e7256 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -25,14 +25,14 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -114,12 +114,12 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s6, s[6:7], 0x0
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
+; VI-NEXT: s_load_dword s7, s[8:9], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -153,14 +153,14 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0
+; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -175,32 +175,32 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,62 +277,62 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -349,62 +349,62 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -420,61 +420,61 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -490,60 +490,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -560,60 +560,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -649,14 +649,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -693,12 +693,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -747,14 +747,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -795,12 +795,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -848,14 +848,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -948,14 +948,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -998,12 +998,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 7dce633..8486fba 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -12,11 +12,11 @@ declare void @llvm.debugtrap() #1
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: trap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
@@ -103,16 +103,16 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[2:3] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
; NOHSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap
@@ -267,14 +267,14 @@ ret:
define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[4:5] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[6:7]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
@@ -403,14 +403,14 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index c0c56eb..b6056f6 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc
; VI-LABEL: truncate_high_elt_extract_vector:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[4:5], 0x0
-; VI-NEXT: s_load_dword s3, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: s_load_dword s1, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s0, s2
-; VI-NEXT: s_sext_i32_i16 s1, s3
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_mul_i32 s1, s1, s0
; VI-NEXT: s_lshr_b32 s0, s1, 16
; VI-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index e668c1d..d9e0e02 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 416dbb2..eb45776 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -28,12 +28,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_uaddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_uaddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_uaddo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_uaddo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_lt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_uaddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_uaddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index c7952f5..8e75127 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -44,17 +44,17 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: udiv_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v2, v1
; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
@@ -75,7 +75,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32:
@@ -401,17 +401,17 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s10
+; VI-NEXT: s_mov_b32 s1, s11
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, s8
+; VI-NEXT: s_mov_b32 s5, s9
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -714,18 +714,18 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v8, v0
; VI-NEXT: v_cvt_f32_u32_e32 v10, v1
@@ -1116,20 +1116,20 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: udiv_i32_div_pow2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
@@ -1203,22 +1203,22 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: udiv_i32_div_k_even:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0xfabbd9c1
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
@@ -1297,22 +1297,22 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0x7d5deca3
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
@@ -1400,18 +1400,18 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_udiv_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
@@ -1424,7 +1424,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i8:
@@ -1540,18 +1540,18 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1564,7 +1564,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i16:
@@ -1688,20 +1688,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1720,7 +1720,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i23:
@@ -1885,20 +1885,20 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i24:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1917,7 +1917,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i24:
@@ -2076,30 +2076,30 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
;
; VI-LABEL: scalarize_mulhu_4xi32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s0, 0x1389c755
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, 0x1389c755
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; VI-NEXT: v_mul_hi_u32 v0, v0, s0
-; VI-NEXT: v_mul_hi_u32 v1, v1, s0
-; VI-NEXT: v_mul_hi_u32 v2, v2, s0
-; VI-NEXT: v_mul_hi_u32 v3, v3, s0
+; VI-NEXT: v_mul_hi_u32 v0, v0, s4
+; VI-NEXT: v_mul_hi_u32 v1, v1, s4
+; VI-NEXT: v_mul_hi_u32 v2, v2, s4
+; VI-NEXT: v_mul_hi_u32 v3, v3, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: scalarize_mulhu_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index f0f0b66..0bb2127 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -77,37 +77,37 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a
;
; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98
+; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX8-NEXT: s_sub_i32 s2, 0, s4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0
+; GFX8-NEXT: v_mul_hi_u32 v4, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: s_mul_i32 s0, s0, s4
-; GFX8-NEXT: s_sub_i32 s0, s5, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_mul_i32 s0, s0, s6
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
@@ -212,7 +212,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
@@ -227,7 +226,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s6
; GFX8-NEXT: s_sub_i32 s2, s4, s2
@@ -236,24 +234,27 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s6
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s7
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s7
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: s_mul_i32 s2, s2, s7
-; GFX8-NEXT: s_sub_i32 s2, s5, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mul_i32 s0, s0, s7
+; GFX8-NEXT: s_sub_i32 s0, s5, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result0 = udiv <2 x i32> %x, %y
@@ -419,14 +420,11 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-LABEL: test_udivrem_v4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -445,9 +443,9 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s8
; GFX8-NEXT: s_cmp_ge_u32 s2, s8
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s9
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s9
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
@@ -455,50 +453,54 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: s_mul_i32 s3, s3, s9
-; GFX8-NEXT: s_sub_i32 s3, s5, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, 0, s10
-; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s9
+; GFX8-NEXT: s_sub_i32 s2, s5, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s5, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s10
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mul_i32 s4, s4, s10
-; GFX8-NEXT: s_sub_i32 s4, s6, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, 0, s11
-; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s10
+; GFX8-NEXT: s_sub_i32 s2, s6, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s6, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s11
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v3
-; GFX8-NEXT: s_mul_i32 s2, s2, s11
-; GFX8-NEXT: s_sub_i32 s2, s7, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_readfirstlane_b32 s0, v3
+; GFX8-NEXT: s_mul_i32 s0, s0, s11
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
%result0 = udiv <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 79b0a96..2a4066d 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -28,42 +28,42 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -100,12 +100,12 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -116,8 +116,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -126,11 +126,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -145,7 +145,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,39 +180,39 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -248,23 +248,23 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
; GFX8-NEXT: v_min_u32_e32 v4, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -273,11 +273,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -290,7 +290,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -333,26 +333,26 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_min_u32 s2, s4, 1
-; GFX8-NEXT: s_or_b32 s2, s5, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_min_u32 s0, s4, 1
+; GFX8-NEXT: s_or_b32 s0, s5, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -360,27 +360,27 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -443,19 +443,19 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
@@ -496,12 +496,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -540,7 +540,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -587,29 +587,29 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -617,32 +617,32 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -713,18 +713,18 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
; GFX8-NEXT: v_ffbh_u32_e32 v11, v2
@@ -763,7 +763,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -772,12 +772,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -825,7 +825,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 5f8d0f6..f4debc2 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
;
; VI-LABEL: uitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
;
; VI-LABEL: uitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
;
; VI-LABEL: uitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
;
; VI-LABEL: uitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index f60a274..5fc395b 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -244,12 +244,12 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
; VI-NEXT: s_cbranch_vccnz .LBB4_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB4_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -296,12 +296,12 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0
; VI-NEXT: s_cbranch_vccnz .LBB5_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB5_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -342,20 +342,19 @@ define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out,
; VI-LABEL: uniform_if_else_ret:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB6_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB6_2: ; %if.then
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -403,28 +402,29 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr
;
; VI-LABEL: uniform_if_else:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 2
; VI-NEXT: s_branch .LBB7_3
; VI-NEXT: .LBB7_2: ; %if.then
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: .LBB7_3: ; %if.end
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -530,13 +530,13 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p
; VI-NEXT: .LBB9_2: ; %bb9
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB9_3: ; %bb7
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -626,20 +626,20 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; VI-NEXT: s_cbranch_execz .LBB11_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_cbranch_scc0 .LBB11_3
; VI-NEXT: .LBB11_2: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB11_3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -692,18 +692,18 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %
; VI-NEXT: .LBB12_1: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc
; VI-NEXT: s_cbranch_execz .LBB12_1
; VI-NEXT: ; %bb.3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%u_cmp = icmp eq i32 %cond, 0
@@ -832,16 +832,16 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr
; VI-NEXT: s_cmp_lt_i32 s2, 1
; VI-NEXT: s_cbranch_scc1 .LBB14_2
; VI-NEXT: ; %bb.1: ; %bb2
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB14_2: ; %bb9
; VI-NEXT: s_endpgm
bb:
@@ -886,20 +886,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_eq:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_eq_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB15_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i64 %cond, 0
@@ -940,20 +940,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_ne:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB16_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB16_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp ne i64 %cond, 0
@@ -994,21 +994,21 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o
;
; VI-LABEL: uniform_if_scc_i64_sgt:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[4:5], 0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB17_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB17_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp sgt i64 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 666ae7c1..092d74f 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_usubo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_sub_u32 s0, s6, s0
+; VI-NEXT: s_sub_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_usubo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_usubo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_usubo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index ca4d689..f20a92d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_add_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index e5de7d0..27dcdf9 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -40,15 +40,15 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u32 s2, 0
+; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -185,14 +185,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -213,13 +213,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,13 +279,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,14 +317,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -345,13 +345,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -411,13 +411,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,13 +494,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -579,13 +579,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -703,14 +703,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -744,12 +744,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
@@ -795,14 +795,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -836,12 +836,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
@@ -888,14 +888,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -931,12 +931,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
@@ -988,15 +988,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1037,13 +1037,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1097,15 +1097,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1146,13 +1146,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1208,15 +1208,15 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1257,13 +1257,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1316,14 +1316,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1363,12 +1363,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1470,13 +1470,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
@@ -1527,15 +1527,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1572,13 +1572,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1626,14 +1626,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1667,12 +1667,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
@@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1772,12 +1772,12 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
@@ -1839,17 +1839,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1882,17 +1882,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1937,15 +1937,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1976,14 +1976,14 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,18 +2032,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2077,10 +2077,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
@@ -2088,7 +2088,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index f7933d7..4b76d5c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -56,25 +56,25 @@ define amdgpu_kernel void @madak_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -170,15 +170,15 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s8
; GFX11-NEXT: s_mov_b32 s17, s9
@@ -188,19 +188,21 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
-; GFX11-NEXT: s_mov_b32 s0, s6
-; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 8bc8fbd..c2abd4f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
;
; GISEL-LABEL: v_pack_b32_v2f16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16_sub:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
;
; GISEL-LABEL: v_pack_b32_v2f16_sub:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
define amdgpu_kernel void @fptrunc(
; GCN-LABEL: fptrunc:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s7, 0x31016000
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x31016000
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
-; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; GISEL-LABEL: fptrunc:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GISEL-NEXT: s_mov_b32 s6, -1
+; GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GISEL-NEXT: s_mov_b32 s2, -1
-; GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
@@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc(
define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fabs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fabs:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fneg:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fneg:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 7f69c47..6c8f288 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -413,18 +413,18 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) {
; SDAG-VI-LABEL: vec_smax_smin_sgpr:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16
-; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT: s_lshr_b32 s0, s4, 16
+; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0
+; SDAG-VI-NEXT: v_max_i16_e64 v2, s0, 0
; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
; SDAG-VI-NEXT: s_endpgm
;
@@ -443,41 +443,41 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-GFX11-LABEL: vec_smax_smin_sgpr:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0
+; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: vec_smax_smin_sgpr:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
+; GISEL-VI-NEXT: s_lshr_b32 s1, s4, 16
; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4
-; GISEL-VI-NEXT: s_max_i32 s2, s2, s3
-; GISEL-VI-NEXT: s_max_i32 s3, s4, s3
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s1
+; GISEL-VI-NEXT: s_max_i32 s4, s4, s0
+; GISEL-VI-NEXT: s_max_i32 s0, s1, s0
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s4
; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, s4
-; GISEL-VI-NEXT: s_min_i32 s2, s2, s4
-; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
-; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, s0
+; GISEL-VI-NEXT: s_min_i32 s0, s0, s4
+; GISEL-VI-NEXT: s_min_i32 s1, s1, s4
+; GISEL-VI-NEXT: s_and_b32 s0, 0xffff, s0
+; GISEL-VI-NEXT: s_and_b32 s1, 0xffff, s1
+; GISEL-VI-NEXT: s_lshl_b32 s0, s0, 16
+; GISEL-VI-NEXT: s_or_b32 s0, s1, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dword v[0:1], v2
; GISEL-VI-NEXT: s_endpgm
;
@@ -506,26 +506,26 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-GFX11-LABEL: vec_smax_smin_sgpr:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s0, 0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, s4
+; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16
+; GISEL-GFX11-NEXT: s_max_i32 s0, s1, s0
+; GISEL-GFX11-NEXT: s_max_i32 s1, s4, 0
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_min_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, 0xff00ff
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s0
+; GISEL-GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GISEL-GFX11-NEXT: s_min_i32 s1, s4, s1
+; GISEL-GFX11-NEXT: s_min_i32 s0, s0, 0xff
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index d5347f8..b60ae19 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_sub_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 340f0cd..836b1d4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.7(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
- ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec
- ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; SI-NEXT: early-clobber %27:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %27.sub0, killed %48, 0, implicit $exec
+ ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %27.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.2(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
- ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+ ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %4, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408..ea48047 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -286,8 +286,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
@@ -319,7 +319,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
@@ -351,7 +351,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
@@ -372,7 +372,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3]
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -393,7 +393,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e12a4be..a033d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -8,26 +8,26 @@
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_i32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_i32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -41,26 +41,26 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
@@ -101,28 +101,28 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) {
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_2xf16:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1032-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_2xf16:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1064-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
@@ -321,10 +321,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1032-NEXT: .LBB9_2: ; %endif
; GFX1032-NEXT: s_endpgm
;
@@ -334,10 +334,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1064-NEXT: .LBB9_2: ; %endif
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,9 +355,9 @@ endif:
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1032-NEXT: s_branch .LBB10_2
; GFX1032-NEXT: .LBB10_1: ; %bb13
@@ -366,25 +366,25 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB10_8
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: s_mov_b32 s1, 0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s2, v2
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
; GFX1032-NEXT: global_load_dword v4, v[2:3], off
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4
@@ -399,13 +399,13 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: ; %bb.5: ; %bb11
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1
-; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4
; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4
; GFX1032-NEXT: ; %bb.6: ; %Flow1
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_and_saveexec_b32 s4, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s4, s1
; GFX1032-NEXT: s_cbranch_execz .LBB10_1
; GFX1032-NEXT: ; %bb.7: ; %bb10
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
@@ -417,9 +417,9 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
;
; GFX1064-LABEL: test_loop_with_if:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1064-NEXT: s_branch .LBB10_2
; GFX1064-NEXT: .LBB10_1: ; %bb13
@@ -428,8 +428,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4
; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB10_8
; GFX1064-NEXT: .LBB10_2: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v2, vcc, s0, v2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
+; GFX1064-NEXT: v_add_co_u32 v2, vcc, s2, v2
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s3, v3, vcc
; GFX1064-NEXT: global_load_dword v4, v[2:3], off
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4
@@ -516,43 +516,43 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_mov_b32 s3, 0
-; GFX1032-NEXT: ; implicit-def: $sgpr4
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: ; implicit-def: $sgpr1
; GFX1032-NEXT: s_branch .LBB11_4
; GFX1032-NEXT: .LBB11_2: ; %bb8
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_add_i32 s3, s3, 1
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1
-; GFX1032-NEXT: s_add_u32 s0, s0, 4
-; GFX1032-NEXT: s_addc_u32 s1, s1, 0
-; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
+; GFX1032-NEXT: s_add_i32 s0, s0, 1
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v1
+; GFX1032-NEXT: s_add_u32 s2, s2, 4
+; GFX1032-NEXT: s_addc_u32 s3, s3, 0
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX1032-NEXT: s_or_b32 s4, s4, s5
+; GFX1032-NEXT: s_or_b32 s1, s1, s5
; GFX1032-NEXT: .LBB11_3: ; %Flow
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4
-; GFX1032-NEXT: s_or_b32 s2, s5, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_and_b32 s5, exec_lo, s1
+; GFX1032-NEXT: s_or_b32 s4, s5, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: .LBB11_4: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v3, v2, s[0:1]
-; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo
+; GFX1032-NEXT: global_load_dword v3, v2, s[2:3]
+; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3
; GFX1032-NEXT: s_cbranch_vccz .LBB11_2
; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: ; implicit-def: $sgpr3
-; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1032-NEXT: ; implicit-def: $sgpr0
+; GFX1032-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1032-NEXT: s_branch .LBB11_3
; GFX1032-NEXT: .LBB11_6: ; %.loopexit
; GFX1032-NEXT: s_endpgm
@@ -564,39 +564,39 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: ; %bb.1: ; %.preheader
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1064-NEXT: s_branch .LBB11_4
; GFX1064-NEXT: .LBB11_2: ; %bb8
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_add_i32 s6, s6, 1
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: v_cmp_ge_u32_e32 vcc, s6, v1
-; GFX1064-NEXT: s_add_u32 s0, s0, 4
-; GFX1064-NEXT: s_addc_u32 s1, s1, 0
+; GFX1064-NEXT: s_add_u32 s2, s2, 4
+; GFX1064-NEXT: s_addc_u32 s3, s3, 0
; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX1064-NEXT: .LBB11_3: ; %Flow
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: .LBB11_4: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX1064-NEXT: global_load_dword v3, v2, s[2:3]
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v3
; GFX1064-NEXT: s_cbranch_vccz .LBB11_2
; GFX1064-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: ; implicit-def: $sgpr6
-; GFX1064-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1064-NEXT: s_branch .LBB11_3
; GFX1064-NEXT: .LBB11_6: ; %.loopexit
; GFX1064-NEXT: s_endpgm
@@ -631,26 +631,26 @@ bb8:
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_addc_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_addc_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -664,26 +664,26 @@ bb:
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subbrev_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subbrev_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,26 +697,26 @@ bb:
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subb_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s6, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subb_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s6, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1063,30 +1063,30 @@ bb:
define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_div_scale_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1032-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1032-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: v_div_scale_f32 v1, s0, v2, v2, v1
+; GFX1032-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1064-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1064-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], v2, v2, v1
+; GFX1064-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1106,30 +1106,32 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -1451,11 +1453,11 @@ define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrs
; GCN-NEXT: s_bitcmp0_b32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB25_2
; GCN-NEXT: ; %bb.1: ; %store
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0xde
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
; GCN-NEXT: .LBB25_2: ; %end
; GCN-NEXT: s_endpgm
%cmp0 = icmp ne i1 %val, 0
@@ -1634,7 +1636,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -1643,13 +1645,13 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
; GFX1032-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1658,7 +1660,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; GFX1064-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: s_endpgm
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1704,30 +1706,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
; GFX1032-LABEL: test_set_inactive_64:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s6
+; GFX1032-NEXT: v_mov_b32_e32 v1, s7
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive_64:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s7
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
store i64 %tmp, ptr addrspace(1) %out
@@ -2354,42 +2356,42 @@ define amdgpu_ps float @test_ps_live() #0 {
define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0
-; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4
+; GFX1032-NEXT: v_cmp_neq_f64_e64 s2, s[0:1], 1.0
+; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1032-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1032-NEXT: s_branch .LBB47_3
; GFX1032-NEXT: .LBB47_2:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: .LBB47_3: ; %endif
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
-; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX1064-NEXT: v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0
+; GFX1064-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1064-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1064-NEXT: s_branch .LBB47_3
; GFX1064-NEXT: .LBB47_2:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s1
; GFX1064-NEXT: .LBB47_3: ; %endif
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
entry:
%v = load double, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index e0b320a..025b856 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -22,11 +22,11 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -36,10 +36,10 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -76,11 +76,11 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -91,10 +91,10 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_zext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -134,11 +134,11 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -149,10 +149,10 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_sext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -199,13 +199,13 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i17_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 2
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s0, s0, 34
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -218,10 +218,10 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i17_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s0, 34
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -263,11 +263,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_f16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f16_e64 v2, s0, 4.0
; VI-NEXT: flat_store_short v[0:1], v2
@@ -275,11 +275,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_f16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
@@ -317,11 +317,11 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_v2i8_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 44
; VI-NEXT: v_mov_b32_e32 v1, 3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -338,9 +338,9 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_v2i8_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0, s0, 12
; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0
@@ -387,11 +387,11 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -404,10 +404,10 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; GFX11-LABEL: no_widen_i16_constant_divergent_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -446,11 +446,11 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i1_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -459,10 +459,10 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i1_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -497,11 +497,11 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -512,10 +512,10 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; GFX11-LABEL: widen_i16_zextload_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -556,11 +556,11 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_add_u32 s0, s0, 0x3e7
@@ -572,9 +572,9 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; GFX11-LABEL: widen_i1_zext_to_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -667,11 +667,11 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
@@ -681,10 +681,10 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; GFX11-LABEL: widen_i16_global_invariant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 5422bfa..54240ad 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -80,12 +80,12 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s4
@@ -134,12 +134,12 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-LABEL: xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -190,12 +190,12 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-LABEL: v_xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_ubyte v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ubyte v2, v[2:3] glc
@@ -239,12 +239,12 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -304,13 +304,13 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
;
; VI-LABEL: scalar_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_not_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_not_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%result = xor i32 %a, -1
@@ -339,13 +339,13 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -384,12 +384,12 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -425,10 +425,10 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: scalar_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -456,12 +456,12 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: scalar_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_not_b64 s[0:1], s[2:3]
+; VI-NEXT: s_not_b64 s[0:1], s[6:7]
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -492,13 +492,13 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v0, v0
; VI-NEXT: v_not_b32_e32 v1, v1
@@ -545,25 +545,25 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
;
; VI-LABEL: xor_cf:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB12_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB12_3
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB12_3: ; %endif
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -606,14 +606,14 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: scalar_xor_literal_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s3, s3, 0xf237b
-; VI-NEXT: s_xor_b32 s2, s2, 0x3039
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0xf237b
+; VI-NEXT: s_xor_b32 s1, s2, 0x3039
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
@@ -647,15 +647,15 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-LABEL: scalar_xor_literal_multi_use_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x3039
-; VI-NEXT: s_mov_b32 s3, 0xf237b
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x3039
+; VI-NEXT: s_mov_b32 s1, 0xf237b
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s6, 0x3039
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_addc_u32 s1, s7, 0xf237b
@@ -689,13 +689,13 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-LABEL: scalar_xor_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 63
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b32 s0, s2, 63
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 63
@@ -720,13 +720,13 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-LABEL: scalar_xor_neg_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], -8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%or = xor i64 %a, -8
@@ -756,13 +756,13 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
;
; VI-LABEL: vector_xor_i64_neg_inline_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v0, -8, v0
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
@@ -796,13 +796,13 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: vector_xor_literal_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v1, 0x146f, v1
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f9137b0..af50e09 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_short [[RESULT]]